In [31]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.naive_bayes import ComplementNB
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE

In [2]:
train = pd.read_csv('train_label.csv')
dev = pd.read_csv('dev_label.csv')
test = pd.read_csv('test.csv')

In [3]:
def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'http\S+', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean

In [4]:
all_train_text = []
train_replies_len = []
for item in train.text:
    item = item.split(',')
    source = process_tweet(item[0])
    source = ' '.join(source)
    all_train_text.append(source)
    train_replies_len.append(1)
    replies = [process_tweet(t) for t in item[1:]]
    replies = [' '.join(t) for t in replies]
    reply_length = len(replies)
    train_replies_len.append(reply_length)
    replies = ' '.join(replies)
    all_train_text.append(replies)

In [5]:
len(all_train_text)

3140

In [6]:
len(train_replies_len)

3140

In [7]:
all_dev_text = []
dev_replies_len = []
for item in dev.text:
    item = item.split(',')
    source = process_tweet(item[0])
    source = ' '.join(source)
    all_dev_text.append(source)
    dev_replies_len.append(1)
    replies = [process_tweet(t) for t in item[1:]]
    replies = [' '.join(t) for t in replies]
    reply_length = len(replies)
    dev_replies_len.append(reply_length)
    replies = ' '.join(replies)
    all_dev_text.append(replies)

In [8]:
len(all_dev_text)

1074

In [9]:
len(dev_replies_len)

1074

In [10]:
all_test_text = []
test_replies_len = []
for item in test.text:
    item = item.split(',')
    source = process_tweet(item[0])
    source = ' '.join(source)
    all_test_text.append(source)
    test_replies_len.append(1)
    replies = [process_tweet(t) for t in item[1:]]
    replies = [' '.join(t) for t in replies]
    reply_length = len(replies)
    test_replies_len.append(reply_length)
    replies = ' '.join(replies)
    all_test_text.append(replies)

In [11]:
len(all_test_text)

1116

In [12]:
len(test_replies_len)

1116

In [13]:
tf_idf_vectorizer = TfidfVectorizer(tokenizer = TweetTokenizer().tokenize, max_df = 0.8, min_df = 3, max_features = 6000)

train_tfidf_matrix = tf_idf_vectorizer.fit_transform(all_train_text)

dev_tfidf_matrix = tf_idf_vectorizer.transform(all_dev_text)

test_tfidf_matrix = tf_idf_vectorizer.transform(all_test_text)

In [14]:
train_tfidf_matrix.shape

(3140, 4926)

In [15]:
dev_tfidf_matrix.shape

(1074, 4926)

In [16]:
test_tfidf_matrix.shape

(1116, 4926)

In [17]:
train = train.drop(['row', 'text', 'profile_description'], axis = 1).round(4)
dev = dev.drop(['row', 'text', 'profile_description'], axis = 1).round(4)
test = test.drop(['row', 'text', 'profile_description'], axis = 1).round(4)

In [18]:
def construct(dataset, replies_length, ftidfmatrix):

    feature_vec = []
    for i in range(0, ftidfmatrix.shape[0] - 1 , 2):
        source_text_features = ftidfmatrix[i].toarray()
        reply_text_features = ftidfmatrix[i + 1].toarray()/replies_length[i + 1]
        text_features = np.squeeze(source_text_features + reply_text_features)

        feature_vec.append(text_features)

    text = pd.DataFrame(feature_vec)
    data = pd.concat([dataset, text], axis = 1)
    
    return data

In [19]:
train = construct(train, train_replies_len, train_tfidf_matrix)
dev = construct(dev, dev_replies_len, dev_tfidf_matrix)
test = construct(test, test_replies_len, test_tfidf_matrix)

  reply_text_features = ftidfmatrix[i + 1].toarray()/replies_length[i + 1]


In [20]:
train = train.fillna(0)
dev = dev.fillna(0)
test = test.fillna(0)

In [21]:
train.to_csv('train_data.csv')
dev.to_csv('dev_data.csv')
test.to_csv('test_data.csv')

In [22]:
user_train = train.iloc[:, :6]
user_dev = dev.iloc[:, :6]
user_test = test.iloc[:, :6]

In [23]:
minmax = MinMaxScaler()
user_train = minmax.fit_transform(user_train)
user_dev = minmax.transform(user_dev)
user_test = minmax.transform(user_test)

In [24]:
user_train = pd.DataFrame(user_train)
train_scale = pd.concat([user_train, train.iloc[:, 6:]], axis = 1)

In [25]:
user_dev = pd.DataFrame(user_dev)
dev_scale = pd.concat([user_dev, dev.iloc[:, 6:]], axis = 1)

In [26]:
user_test = pd.DataFrame(user_test)
test_scale = pd.concat([user_test, test.iloc[:, 6:]], axis = 1)

In [27]:
train_scale.to_csv('train_minmax.csv')
dev_scale.to_csv('dev_minmax.csv')
test_scale.to_csv('test_minmax.csv')

In [28]:
train_x = train_scale.drop('label', axis = 1)
train_y = train_scale.label

In [29]:
dev_x = dev_scale.drop('label', axis = 1)
dev_y = dev_scale.label

In [32]:
x_sm_train, y_sm_train = SMOTE().fit_resample(np.array(train_x), train_y)
x_sm_dev, y_sm_dev = SMOTE().fit_resample(np.array(dev_x), dev_y)

In [34]:
alphas = np.logspace(-3, 1, 5)

for a in alphas:
    comp_nb = ComplementNB(alpha = a)
    comp_nb.fit(x_sm_train, y_sm_train)
    pred = comp_nb.predict(x_sm_dev)
    print(a, classification_report(y_sm_dev, pred))

0.001               precision    recall  f1-score   support

           0       0.86      0.95      0.90       422
           1       0.94      0.85      0.89       422

    accuracy                           0.90       844
   macro avg       0.90      0.90      0.90       844
weighted avg       0.90      0.90      0.90       844

0.01               precision    recall  f1-score   support

           0       0.89      0.94      0.91       422
           1       0.93      0.88      0.90       422

    accuracy                           0.91       844
   macro avg       0.91      0.91      0.91       844
weighted avg       0.91      0.91      0.91       844

0.1               precision    recall  f1-score   support

           0       0.93      0.91      0.92       422
           1       0.92      0.93      0.92       422

    accuracy                           0.92       844
   macro avg       0.92      0.92      0.92       844
weighted avg       0.92      0.92      0.92       844

1.0 

In [38]:
y_pred = comp_nb.predict(test_scale)
results = pd.DataFrame({'Predicted' : y_pred})
results.index.name = 'Id'
results.to_csv('test_results.csv')

In [39]:
x_sm_train, y_sm_train = SMOTE().fit_resample(np.array(train_x), train_y)
x_sm_dev, y_sm_dev = SMOTE().fit_resample(np.array(dev_x), dev_y)

In [34]:
comp_nb = ComplementNB()
comp_nb.fit(x_sm_train, y_sm_train)
pred = comp_nb.predict(x_sm_dev)
print(classification_report(y_sm_dev, pred))

              precision    recall  f1-score   support

           0       0.95      0.86      0.90       422
           1       0.87      0.95      0.91       422

    accuracy                           0.91       844
   macro avg       0.91      0.91      0.91       844
weighted avg       0.91      0.91      0.91       844



In [35]:
y_pred = comp_nb.predict(test_scale)
results = pd.DataFrame({'Predicted' : y_pred})
results.index.name = 'Id'
results.to_csv('test_results.csv')

In [44]:
cs = np.logspace(-3, 1, 5)
gammas = np.logspace(-3, 1, 5)

for c in cs:
    for gamma in gammas:
        model = svm.SVC(kernel = 'rbf', gamma = gamma, C = c, class_weight = 'balanced').fit(x_sm_train, y_sm_train)
        train_score = accuracy_score (y_sm_dev, model.predict(x_sm_dev))
        
        print(f'C = {c}, gamma = {gamma}, train acc = {train_score}')
        y_valid = model.predict(dev_x)
        valid_score = accuracy_score(dev_y, y_valid)
        print(f'valid acc = {valid_score}')
        
        print(classification_report(dev_y, y_valid))

C = 0.001, gamma = 0.001, train acc = 0.5829383886255924
valid acc = 0.5754189944134078
              precision    recall  f1-score   support

           0       0.83      0.58      0.68       422
           1       0.27      0.57      0.36       115

    accuracy                           0.58       537
   macro avg       0.55      0.57      0.52       537
weighted avg       0.71      0.58      0.61       537

C = 0.001, gamma = 0.01, train acc = 0.5853080568720379
valid acc = 0.5754189944134078
              precision    recall  f1-score   support

           0       0.83      0.58      0.68       422
           1       0.27      0.57      0.36       115

    accuracy                           0.58       537
   macro avg       0.55      0.57      0.52       537
weighted avg       0.71      0.58      0.61       537

C = 0.001, gamma = 0.1, train acc = 0.5924170616113744
valid acc = 0.5754189944134078
              precision    recall  f1-score   support

           0       0.83      0

In [41]:
model = svm.SVC(gamma = 0.1, C = 10, class_weight = 'balanced').fit(x_sm_train, y_sm_train)

In [43]:
y_pred = model.predict(test_scale)
results = pd.DataFrame({'Predicted' : y_pred})
results.index.name = 'Id'
results.to_csv('test_results.csv')