In [62]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [63]:
# load the labelled training tweets and the unlabelled test tweets
train, test = (pd.read_csv(csv_path) for csv_path in ('tweets_train.csv', 'tweets_test.csv'))


In [64]:
# preprocessing
# Drop the columns that are not used as features ('handle' is the value to predict).
test = test.drop(['handle', 'in_reply_to_screen_name'], axis=1)
# fillna returns a new frame, so the result must be assigned back (the original
# call discarded it). The median is restricted to numeric columns because the
# text columns have no median.
test = test.fillna(test.median(numeric_only=True))
# A missing original author means "not a retweet"; use an empty string.
test['original_author'] = test['original_author'].fillna('')
print(test.isnull().sum())

id                 0
text               0
is_retweet         0
original_author    0
time               0
is_quote_status    0
lang               0
retweet_count      0
favorite_count     0
dtype: int64


In [65]:
# peek at the first rows of the training frame
train.head()

Unnamed: 0,id,handle,text,is_retweet,original_author,time,lang,retweet_count,favorite_count
0,7.81e+17,HillaryClinton,The question in this election: Who can put the...,False,,2016-09-28T00:22:34,en,218,651
1,7.81e+17,HillaryClinton,"Last night, Donald Trump said not paying taxes...",True,timkaine,2016-09-27T23:45:00,en,2445,5308
2,7.81e+17,HillaryClinton,Couldn't be more proud of @HillaryClinton. Her...,True,POTUS,2016-09-27T23:26:40,en,7834,27234
3,7.81e+17,HillaryClinton,"If we stand together, there's nothing we can't...",False,,2016-09-27T23:08:41,en,916,2542
4,7.81e+17,HillaryClinton,Both candidates were asked about how they'd co...,False,,2016-09-27T22:30:27,en,859,2882


In [66]:
# map handle to category
def allocate_category(handle):
    """Return the class label for a twitter handle.

    'HillaryClinton' maps to 0; every other handle maps to 1.
    """
    return 0 if handle == 'HillaryClinton' else 1
# label every training tweet with the numeric category of its author
train['category'] = train['handle'].map(allocate_category)

In [69]:
# feature extraction
# Learn the vocabulary from the training tweets, then fit a term-frequency
# transformer (IDF weighting disabled) on the resulting count matrix.
count_vect = CountVectorizer().fit(train.text)
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(count_vect.transform(train.text))
# using both textual and non-textual features
def get_manual_features(df, vectorizer=None, transformer=None):
    """Build the feature frame used by the classifier.

    The result has the same index as ``df`` and these columns, in order:
    '@count', '#count', '.count', ',count' (occurrences of each symbol in the
    tweet text), 'retweet_count', 'hilary_supporter' (True when the original
    author is 'HFA' or 'TheBriefing2016'), followed by one column per
    vocabulary term (labelled 0..n_terms-1) holding the transformed term
    weights of the tweet text.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'text', 'retweet_count' and 'original_author'.
        The frame passed in is not modified.
    vectorizer : object with ``transform(texts)``, optional
        Already-fitted text vectorizer; defaults to the notebook-level
        ``count_vect``.
    transformer : object with ``transform(counts)``, optional
        Already-fitted count transformer; defaults to the notebook-level
        ``tf_transformer``.

    Returns
    -------
    pandas.DataFrame
    """
    if vectorizer is None:
        vectorizer = count_vect
    if transformer is None:
        transformer = tf_transformer

    # fillna returns a new frame; the original code discarded the result, so
    # missing numeric values (e.g. retweet_count) were never imputed.
    df = df.fillna(df.mean(numeric_only=True))

    res = pd.DataFrame(index=df.index)
    for symbol in ('@', '#', '.', ','):
        # plain str.count (not a regex), so '.' is matched literally
        res[symbol + 'count'] = df['text'].apply(lambda text, s=symbol: text.count(s))
    res['retweet_count'] = df['retweet_count']
    res['hilary_supporter'] = df['original_author'].isin(['HFA', 'TheBriefing2016'])

    term_weights = transformer.transform(vectorizer.transform(df['text']))
    # One dense conversion replaces the per-cell .loc assignment loop, which
    # produced the same values but one element at a time.
    new = pd.DataFrame(term_weights.toarray(),
                       index=df.index,
                       columns=list(range(term_weights.shape[1])))
    return pd.concat([res, new], axis=1)
# build the training feature matrix and show its first rows
features = get_manual_features(train)
print(features.head())

   @count  #count  .count  ,count  retweet_count hilary_supporter    0    1  \
0       0       0       1       0            218            False  0.0  0.0   
1       0       0       3       1           2445            False  0.0  0.0   
2       2       0       2       0           7834            False  0.0  0.0   
3       0       0       3       1            916            False  0.0  0.0   
4       0       0       3       0            859            False  0.0  0.0   

     2    3  ...   9344  9345  9346  9347  9348  9349  9350  9351  9352  9353  
0  0.0  0.0  ...    0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
1  0.0  0.0  ...    0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2  0.0  0.0  ...    0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
3  0.0  0.0  ...    0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
4  0.0  0.0  ...    0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  

[5 rows x 9360 columns]


In [70]:
# experimenting with different values of the smoothing parameter alpha
alphas = [
    0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
    1, 2, 10,
]

In [75]:
# use 10-fold cross-validation to estimate how well each alpha generalizes
# NOTE: cross_val_score with a classifier's default scoring reports accuracy
# (higher is better), so the message says "accuracy" rather than "error".
for a in alphas:
    complex_clf = MultinomialNB(alpha=a)
    scores = cross_val_score(complex_clf, features, train.category, cv=10)
    print('alpha is {} and average cross-validation accuracy on train-set is: {}'.format(a, scores.mean()))

alpha is 0.1 and average cross-validation error on train-set is: 0.887217476359
alpha is 0.2 and average cross-validation error on train-set is: 0.884468084176
alpha is 0.3 and average cross-validation error on train-set is: 0.88196994825
alpha is 0.4 and average cross-validation error on train-set is: 0.881474945156
alpha is 0.5 and average cross-validation error on train-set is: 0.880469942
alpha is 0.6 and average cross-validation error on train-set is: 0.87897243889
alpha is 0.7 and average cross-validation error on train-set is: 0.877226806105
alpha is 0.8 and average cross-validation error on train-set is: 0.875978676429
alpha is 0.9 and average cross-validation error on train-set is: 0.874727420171
alpha is 1 and average cross-validation error on train-set is: 0.872979287371
alpha is 2 and average cross-validation error on train-set is: 0.855491070257
alpha is 10 and average cross-validation error on train-set is: 0.671002598454


In [17]:
# small values of alpha seem to work better, so keep exploring in that range
alphas = [0.00001, 0.001]
# use 10-fold cross-validation to estimate how well each alpha generalizes
for a in alphas:
    # No .fit() here: cross_val_score clones the estimator and fits the clone
    # on every fold, so a fit on the full training set would be wasted work.
    complex_clf = MultinomialNB(alpha=a)
    scores = cross_val_score(complex_clf, features, train.category, cv=10)
    # the scores are accuracies (higher is better), not errors
    print('alpha is {} and average cross-validation accuracy on train-set is: {}'.format(a, scores.mean()))

alpha is 1e-05 and average cross-validation error on train-set is: 0.899964421653
alpha is 0.001 and average cross-validation error on train-set is: 0.898960668504


In [18]:
# alpha=0.001 is used to avoid overfitting, although even smaller values
# scored slightly better in the sweep above
a = 0.001
complex_clf = MultinomialNB(alpha=a)
scores = cross_val_score(complex_clf, features, train.category, cv=10)
# the scores are accuracies (higher is better), not errors
print('average cross-validation accuracy on train-set is: {}'.format(scores.mean()))

average cross-validation error on train-set is: 0.898960668504


Trying again without fitting the class prior, i.e. keeping a uniform prior (`fit_prior=False`)

In [21]:
# experimenting with different alphas while keeping a uniform class prior
# (fit_prior=False)
alphas = [0.00005, 0.0001, 0.001, 0.1, 0.3, 0.8]
for a in alphas:
    complex_clf = MultinomialNB(alpha=a, fit_prior=False)
    scores = cross_val_score(complex_clf, features, train.category, cv=10)
    # the scores are accuracies (higher is better), not errors
    print('alpha is {} and average cross-validation accuracy on train-set is: {}'.format(a, scores.mean()))

alpha is 5e-05 and average cross-validation error on train-set is: 0.901968810743
alpha is 0.0001 and average cross-validation error on train-set is: 0.903468196676
alpha is 0.001 and average cross-validation error on train-set is: 0.899469421684
alpha is 0.1 and average cross-validation error on train-set is: 0.885723099832
alpha is 0.3 and average cross-validation error on train-set is: 0.880980581129
alpha is 0.8 and average cross-validation error on train-set is: 0.87123115457


Selected model: alpha = 0.0001, with a uniform class prior (`fit_prior=False`)

In [72]:
# predicting results
# fit the selected model on the full training set, then build the test features
complex_clf = MultinomialNB(alpha=0.0001, fit_prior=False)
complex_clf.fit(features, train.category)
test_features = get_manual_features(test)

In [73]:
# validating features: count the missing values in every test feature column
test_features.isnull().sum()

@count              0
#count              0
.count              0
,count              0
retweet_count       0
hilary_supporter    0
0                   0
1                   0
2                   0
3                   0
4                   0
5                   0
6                   0
7                   0
8                   0
9                   0
10                  0
11                  0
12                  0
13                  0
14                  0
15                  0
16                  0
17                  0
18                  0
19                  0
20                  0
21                  0
22                  0
23                  0
                   ..
9324                0
9325                0
9326                0
9327                0
9328                0
9329                0
9330                0
9331                0
9332                0
9333                0
9334                0
9335                0
9336                0
9337                0
9338      

In [74]:
# predicting handle values: store the predicted numeric category in a helper column
test['category'] = complex_clf.predict(test_features)
def category_2_candidate(category):
    """Translate a numeric category back into a twitter handle.

    Category 0 yields 'HillaryClinton'; any other value yields 'realDonaldTrump'.
    """
    return 'HillaryClinton' if category == 0 else 'realDonaldTrump'
# turn the predicted categories into the handle names expected in the output
test['handle'] = test['category'].map(category_2_candidate)

In [76]:
# saving results
# Remove the helper column with drop(..., errors='ignore') instead of `del`,
# so re-running this cell does not raise KeyError once the column is gone.
test = test.drop(['category'], axis=1, errors='ignore')
test.to_csv('Complex_tweets_test.csv', encoding='utf8', index=False)

In [77]:
# sanity check for results: read the saved file back and show its first rows
test_from_save = pd.read_csv('Complex_tweets_test.csv')
test_from_save.head()

Unnamed: 0,id,text,is_retweet,original_author,time,is_quote_status,lang,retweet_count,favorite_count,handle
0,7.34e+17,#MichaelBrown would have been 20 years old tod...,True,LSD_Esq,2016-05-20T18:07:08,False,en,594,1096,HillaryClinton
1,7.34e+17,"Congratulations on becoming a U.S. citizen, Al...",False,,2016-05-20T17:24:12,False,en,1701,4239,HillaryClinton
2,7.34e+17,We need a president who will unite leaders aro...,False,,2016-05-20T17:12:52,False,en,1817,3577,HillaryClinton
3,7.34e+17,"Dear Congress,\r\n\r\nLet's get this done.\r\n...",False,,2016-05-20T16:21:13,False,en,2530,6012,HillaryClinton
4,7.34e+17,Failing @NYTimes will always take a good story...,False,,2016-05-20T16:11:21,False,en,3750,12372,realDonaldTrump
