In [122]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import re
from sklearn.pipeline import Pipeline
import nltk
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import f1_score
from nltk.corpus import stopwords
import pickle
import tweepy
from IPython.display import clear_output
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [123]:
rf_df = pd.read_csv('data/full/rf_preproc_train.csv')

In [124]:
rf_df = rf_df[['user_id', 'avg_fav', 'avg_hash', 'avg_len', 'avg_ret', 'default_profile',
       'fake_words_score', 'favourites_count', 'followers_count', 'freq',
       'friends_count', 'genuine_words_score', 'listed_count', 'max_fav',
       'max_hash', 'max_len', 'max_ret', 'media_perc', 'min_fav', 'min_hash',
       'min_len', 'min_ret', 'porn_words_score',
       'profile_use_background_image', 'prop_words_score', 'quote_perc',
       'ret_perc', 'spam_words_score', 'statuses_count',
       'tweet_intradistance', 'url', 'url_intradistance', 'url_perc',
       'description_len', 'name_len', 'screen_name_len', 'age','target', 'nsfw_avg', 'nsfw_profile']]

In [125]:
X_train, X_val, y_train, y_val = train_test_split(rf_df.drop(columns=['target']), rf_df['target'], test_size=0.2)

In [126]:
rf = RandomForestClassifier(n_estimators=40, criterion='gini', max_depth=28)

In [127]:
cv = cross_validate(rf, X_train.drop(columns=['user_id']), y_train, cv=10, scoring='f1_macro')

In [128]:
cv['test_score'].mean()

0.9616976914856987

In [129]:
rf.fit(X_train.drop(columns=['user_id']), y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=28, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [130]:
rf_pred = rf.predict_proba(X_val.drop(columns=['user_id']))

In [131]:
rf.predict_proba(X_val.drop(columns=['user_id']).iloc[0:1])

array([[0.075, 0.05 , 0.025, 0.025, 0.825]])

In [132]:
rf_prob = pd.DataFrame(rf_pred, columns=['rf_0', 'rf_1', 'rf_2', 'rf_3', 'rf_4'])

In [133]:
X_val.reset_index(inplace=True, drop=True)

In [134]:
rf_prob.reset_index(inplace=True, drop=True)

In [135]:
y_val.reset_index(inplace=True, drop=True)

In [136]:
rf_prob['user_id'] = X_val['user_id']

In [137]:
rf_prob['target'] = y_val

In [138]:
rf_prob.head()

Unnamed: 0,rf_0,rf_1,rf_2,rf_3,rf_4,user_id,target
0,0.075,0.05,0.025,0.025,0.825,1605070976,2
1,1.0,0.0,0.0,0.0,0.0,1383685152,0
2,0.0,0.0,0.0,1.0,0.0,1176422048,3
3,0.0,0.0,0.0,1.0,0.0,1004089480356167685,3
4,0.0,1.0,0.0,0.0,0.0,789172980676857856,1


## BotOrNot

In [139]:
model = pickle.load(open("data/bot_or_not/model.model", "rb"))

In [140]:
X_val.drop(columns=['porn_words_score', 'prop_words_score', 'spam_words_score', 'fake_words_score', 'genuine_words_score'], inplace=True)

In [141]:
y_bon = y_val

In [142]:
y_bon[:13]

0     2
1     0
2     3
3     3
4     1
5     0
6     3
7     2
8     2
9     0
10    4
11    3
12    0
Name: target, dtype: int64

In [143]:
y_bon[y_bon != 4] = 1
y_bon[y_bon == 4] = 0

In [144]:
X_val.drop(columns=['nsfw_profile', 'nsfw_avg'], inplace=True)
X_train.drop(columns=['nsfw_profile', 'nsfw_avg'], inplace=True)

In [145]:
bon_pred = model.predict_proba(X_val.drop(columns=['user_id']))

In [146]:
bon_pred

array([[0.88571429, 0.11428571],
       [0.        , 1.        ],
       [0.        , 1.        ],
       ...,
       [0.31428571, 0.68571429],
       [0.        , 1.        ],
       [0.4       , 0.6       ]])

In [147]:
bon_prob = pd.DataFrame(bon_pred, columns=['bon_4', 'bon_3'])

bon_prob['bon_3'] = bon_pred.astype(int)

bon_prob['bon_4'] = np.logical_not(bon_pred).astype(int)

In [148]:
bon_prob['bon_0'] = 0
bon_prob['bon_1'] = 0
bon_prob['bon_2'] = 0

In [149]:
y_pred = model.predict(X_val.drop(columns=['user_id']))

In [150]:
print(pd.crosstab(y_bon, y_pred , rownames=['Actual Target'], colnames=['Predicted Target']))

Predicted Target  0.0   1.0
Actual Target              
0                 738     8
1                 248  4028


In [151]:
bon_prob['bon_0'] = bon_prob['bon_3']/4
bon_prob['bon_1'] = bon_prob['bon_3']/4
bon_prob['bon_2'] = bon_prob['bon_3']/4
bon_prob['bon_3'] = bon_prob['bon_3']/4

In [152]:
X_val.reset_index(inplace=True, drop=True)

In [153]:
bon_prob.reset_index(inplace=True, drop=True)

In [154]:
bon_prob['user_id'] = X_val['user_id']

In [155]:
bon_prob['target'] = rf_prob['target']

In [156]:
bon_prob = bon_prob.reindex(columns=['bon_0', 'bon_1', 'bon_2', 'bon_3', 'bon_4', 'user_id', 'target'])

In [157]:
users = pd.read_csv('data/full/train.csv')

In [158]:
users = users[['user_id','target']]

In [159]:
tweets = pd.concat([pd.read_csv('data/propaganda/tweets.csv', sep='\t')[['user_id','full_text']], \
          pd.read_csv('data/porn/tweets.csv', sep='\t')[['user_id','full_text']], \
          pd.read_csv('data/spam/tweets.csv', sep='\t')[['user_id','full_text']], \
          pd.read_csv('data/fake_followers/tweets.csv', sep='\t')[['user_id','full_text']], \
          pd.read_csv('data/genuine/tweets.csv', sep='\t')[['user_id','full_text']]])

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [160]:
tweets = tweets.drop_duplicates()

In [161]:
tweets = pd.merge(tweets, users, on='user_id')

In [162]:
mask = np.in1d(tweets.user_id, X_train.user_id)

In [163]:
val_mask = np.in1d(tweets.user_id, X_val.user_id)

In [164]:
nb_train = tweets[mask]

In [165]:
nb_val = tweets[val_mask]

In [166]:
def remove_rt(x):
    if 'RT @' in x:
        try:
            return x[x.find(':')+2:]
        except:
            return x
    else:
        return x

In [167]:
stop_words = stopwords.words('english')

def remove_stop(x):
    return [word for word in x.split() if word not in stop_words]

In [168]:
nb_train['full_text'] = nb_train['full_text'].apply(lambda x: remove_rt(x))
nb_train['full_text'] = nb_train['full_text'].apply(lambda x: re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', x))
nb_train['full_text'] = nb_train['full_text'].apply(lambda x: re.sub(r'[^\w\s]','',x))
nb_train['full_text'] = nb_train['full_text'].apply(lambda x: x.lower())
nb_train['full_text'] = nb_train['full_text'].apply(lambda x: remove_stop(x))
nb_train['full_text'] = nb_train['full_text'].astype(str)
nb_train = nb_train[nb_train['full_text']!='[]']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [169]:
nb_val['full_text'] = nb_val['full_text'].apply(lambda x: remove_rt(x))
nb_val['full_text'] = nb_val['full_text'].apply(lambda x: re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', x))
nb_val['full_text'] = nb_val['full_text'].apply(lambda x: re.sub(r'[^\w\s]','',x))
nb_val['full_text'] = nb_val['full_text'].apply(lambda x: x.lower())
nb_val['full_text'] = nb_val['full_text'].apply(lambda x: remove_stop(x))
nb_val['full_text'] = nb_val['full_text'].astype(str)
nb_val = nb_val[nb_val['full_text']!='[]']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [170]:
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(CountVectorizer, self).build_analyzer()
        return lambda doc:(stemmer.stem(w) for w in analyzer(doc))


stem_vectorizer = StemmedCountVectorizer(stemmer)

pipeline = Pipeline([('vect', stem_vectorizer), 
                     ('tfidf', TfidfTransformer()), 
                     ('mnb', MultinomialNB(fit_prior=False))])

clf = pipeline.fit(nb_train['full_text'], nb_train['target'])

In [171]:
predictions = []
i = 0

for usr in X_val['user_id']:
    
    i+=1
    clear_output()
    print(i)
    tweets_list = nb_val[nb_val.user_id == usr]['full_text']
    tweets_list = tweets_list[tweets_list.notnull()]
    if len(tweets_list[tweets_list.notnull()]) != 0:
        pred = clf.predict_proba(tweets_list)
        predictions.append(np.mean(pred, axis=0))
    else:
        predictions.append(np.array([0.2,0.2,0.2,0.2,0.2]))

5022


In [172]:
prob = pd.DataFrame(predictions, columns=['nb_0', 'nb_1', 'nb_2', 'nb_3', 'nb_4'])

In [173]:
prob['target'] = rf_prob['target']

In [174]:
prob['user_id'] = X_val['user_id']

In [175]:
# prob.to_csv('data/full/predictions1.csv')

In [176]:
stack_df = rf_prob.merge(prob, on=['user_id' , 'target'])

In [177]:
stack_df = stack_df.merge(bon_prob, on=['user_id' , 'target'])

In [178]:
stack_df = stack_df.reindex_axis(sorted(stack_df.columns), axis=1)

  """Entry point for launching an IPython kernel.


In [179]:
stack_df.head()

Unnamed: 0,bon_0,bon_1,bon_2,bon_3,bon_4,nb_0,nb_1,nb_2,nb_3,nb_4,rf_0,rf_1,rf_2,rf_3,rf_4,target,user_id
0,0.028571,0.028571,0.028571,0.028571,0.885714,0.284414,0.062027,0.249144,0.151339,0.253077,0.075,0.05,0.025,0.025,0.825,2,1605070976
1,0.25,0.25,0.25,0.25,0.0,0.2,0.2,0.2,0.2,0.2,1.0,0.0,0.0,0.0,0.0,0,1383685152
2,0.25,0.25,0.25,0.25,0.0,0.2,0.2,0.2,0.2,0.2,0.0,0.0,0.0,1.0,0.0,3,1176422048
3,0.25,0.25,0.25,0.25,0.0,0.2,0.2,0.2,0.2,0.2,0.0,0.0,0.0,1.0,0.0,3,1004089480356167685
4,0.25,0.25,0.25,0.25,0.0,0.01541,0.85148,0.064768,0.001792,0.066551,0.0,1.0,0.0,0.0,0.0,1,789172980676857856


In [180]:
stack_df.to_csv('data/full/logreg_train.csv')

In [181]:
stack_df

Unnamed: 0,bon_0,bon_1,bon_2,bon_3,bon_4,nb_0,nb_1,nb_2,nb_3,nb_4,rf_0,rf_1,rf_2,rf_3,rf_4,target,user_id
0,0.028571,0.028571,0.028571,0.028571,0.885714,0.284414,0.062027,0.249144,0.151339,0.253077,0.075,0.050,0.025,0.025,0.825,2,1605070976
1,0.250000,0.250000,0.250000,0.250000,0.000000,0.200000,0.200000,0.200000,0.200000,0.200000,1.000,0.000,0.000,0.000,0.000,0,1383685152
2,0.250000,0.250000,0.250000,0.250000,0.000000,0.200000,0.200000,0.200000,0.200000,0.200000,0.000,0.000,0.000,1.000,0.000,3,1176422048
3,0.250000,0.250000,0.250000,0.250000,0.000000,0.200000,0.200000,0.200000,0.200000,0.200000,0.000,0.000,0.000,1.000,0.000,3,1004089480356167685
4,0.250000,0.250000,0.250000,0.250000,0.000000,0.015410,0.851480,0.064768,0.001792,0.066551,0.000,1.000,0.000,0.000,0.000,1,789172980676857856
5,0.032143,0.032143,0.032143,0.032143,0.871429,0.200000,0.200000,0.200000,0.200000,0.200000,1.000,0.000,0.000,0.000,0.000,0,223381929
6,0.242857,0.242857,0.242857,0.242857,0.028571,0.200000,0.200000,0.200000,0.200000,0.200000,0.000,0.000,0.000,1.000,0.000,3,617556297
7,0.242857,0.242857,0.242857,0.242857,0.028571,0.138355,0.099142,0.189730,0.006774,0.565999,0.000,0.000,0.975,0.025,0.000,2,80525033
8,0.242857,0.242857,0.242857,0.242857,0.028571,0.060808,0.091396,0.652330,0.016567,0.178899,0.000,0.000,1.000,0.000,0.000,2,2377036634
9,0.250000,0.250000,0.250000,0.250000,0.000000,0.468654,0.111314,0.273150,0.067125,0.079757,1.000,0.000,0.000,0.000,0.000,0,716653062748770304


## Metamodel training

In [182]:
# stack_df = pd.read_csv('data/full/logreg_train.csv').drop(columns=['Unnamed: 0'])

In [192]:
lr = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=2000, class_weight='balanced')

In [193]:
X_train, X_val, y_train, y_val = train_test_split(rf_df.drop(columns=['target']), rf_df['target'], test_size=0.2, random_state=42)

In [194]:
# lr.fit(X_train.drop(columns=['user_id']), y_train)

In [195]:
cv = cross_validate(lr, stack_df.drop(columns=['user_id', 'target']), stack_df['target'], cv=10, scoring='f1_macro')

In [196]:
cv['test_score'].mean()

0.9698209718409172

0.9697595083818902

In [197]:
lr.fit(stack_df.drop(columns=['user_id', 'target']), stack_df['target'])

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=2000,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [198]:
z = lr.predict_proba(stack_df.drop(columns=['user_id', 'target']).iloc[1:2])

In [199]:
pickle.dump( lr, open( "../scripts/models/lr.model", "wb" ) )

In [194]:
z = z.reshape(xx0.shape)
plt.figure(1, figsize=(12, 9))
plt.pcolormesh(xx0, xx1, z, cmap=background_cmap)

font = {'family' : 'sans', 'size'   : 32}
plt.rc('font', **font)

for i, color in zip(logistic.classes_, colors):
    idx = np.where(y == i)
    plt.scatter(x[idx, 0], x[idx, 1], c=color) #, cmap=plt.cm.Pastel2)

plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

plt.xlim(xx0.min(), xx0.max())
plt.ylim(xx1.min(), xx1.max())
plt.xticks(())
plt.yticks(())


### plot also the planes
coef = logistic.coef_
intercept = logistic.intercept_

def plot_hyperplane(c, color):
    def line(x0):
        return (-(x0 * coef[c, 0]) - intercept[c]) / coef[c, 1]
    plt.plot([x0_min, x0_max], [line(x0_min), line(x0_max)],
             ls="--", lw=4, color=color)

# colors = "rgb"
for i, color in zip(logistic.classes_, colors):
    plot_hyperplane(i, color)

plt.show()

NameError: name 'xx0' is not defined