In [323]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import re
from sklearn.pipeline import Pipeline
import nltk
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import f1_score
from nltk.corpus import stopwords
import pickle
import tweepy
from IPython.display import clear_output
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [324]:
rf_df = pd.read_csv('data/full/rf_preproc_train.csv')

In [325]:
rf_df = rf_df[['user_id', 'avg_fav', 'avg_hash', 'avg_len', 'avg_ret', 'default_profile',
       'fake_words_score', 'favourites_count', 'followers_count', 'freq',
       'friends_count', 'genuine_words_score', 'listed_count', 'max_fav',
       'max_hash', 'max_len', 'max_ret', 'media_perc', 'min_fav', 'min_hash',
       'min_len', 'min_ret', 'porn_words_score',
       'profile_use_background_image', 'prop_words_score', 'quote_perc',
       'ret_perc', 'spam_words_score', 'statuses_count',
       'tweet_intradistance', 'url', 'url_intradistance', 'url_perc',
       'description_len', 'name_len', 'screen_name_len', 'age','target']]

In [326]:
X_train, X_val, y_train, y_val = train_test_split(rf_df.drop(columns=['target']), rf_df['target'], test_size=0.2, random_state=42)

In [327]:
rf = RandomForestClassifier(n_estimators=30, criterion='gini', max_depth=5) #max_depth=24

In [328]:
cv = cross_validate(rf, X_train.drop(columns=['user_id']), y_train, cv=10, scoring='f1_macro')

In [329]:
cv['test_score'].mean()

0.9243772921886872

In [330]:
rf.fit(X_train.drop(columns=['user_id']), y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [331]:
rf_pred = rf.predict_proba(X_val.drop(columns=['user_id']))

In [332]:
rf.predict_proba(X_val.drop(columns=['user_id']).iloc[0:1])

array([[0.57282846, 0.00221002, 0.02502992, 0.24615608, 0.15377553]])

In [333]:
X_val.iloc[0:1]['user_id']

4347    610392907
Name: user_id, dtype: int64

In [334]:
rf_prob = pd.DataFrame(rf_pred, columns=['rf_0', 'rf_1', 'rf_2', 'rf_3', 'rf_4'])

In [335]:
X_val.reset_index(inplace=True, drop=True)

In [336]:
rf_prob.reset_index(inplace=True, drop=True)

In [337]:
y_val.reset_index(inplace=True, drop=True)

In [338]:
rf_prob['user_id'] = X_val['user_id']

In [339]:
rf_prob['target'] = y_val

## Random Forest df è pronto

In [341]:
users = pd.read_csv('data/full/train.csv')

In [342]:
users = users[['user_id','target']]

In [343]:
tweets = pd.concat([pd.read_csv('data/propaganda/tweets.csv', sep='\t')[['user_id','full_text']], \
          pd.read_csv('data/porn/tweets.csv', sep='\t')[['user_id','full_text']], \
          pd.read_csv('data/spam/tweets.csv', sep='\t')[['user_id','full_text']], \
          pd.read_csv('data/fake_followers/tweets.csv', sep='\t')[['user_id','full_text']], \
          pd.read_csv('data/genuine/tweets.csv', sep='\t')[['user_id','full_text']]])

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [344]:
tweets = tweets.drop_duplicates()

In [345]:
tweets = pd.merge(tweets, users, on='user_id')

In [346]:
mask = np.in1d(tweets.user_id, X_train.user_id)

In [347]:
val_mask = np.in1d(tweets.user_id, X_val.user_id)

In [349]:
nb_train = tweets[mask]

In [350]:
nb_val = tweets[val_mask]

In [351]:
def remove_rt(x):
    if 'RT @' in x:
        try:
            return x[x.find(':')+2:]
        except:
            return x
    else:
        return x

In [352]:
stop_words = stopwords.words('english')

def remove_stop(x):
    return [word for word in x.split() if word not in stop_words]

In [353]:
nb_train['full_text'] = nb_train['full_text'].apply(lambda x: remove_rt(x))
nb_train['full_text'] = nb_train['full_text'].apply(lambda x: re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', x))
nb_train['full_text'] = nb_train['full_text'].apply(lambda x: re.sub(r'[^\w\s]','',x))
nb_train['full_text'] = nb_train['full_text'].apply(lambda x: x.lower())
nb_train['full_text'] = nb_train['full_text'].apply(lambda x: remove_stop(x))
nb_train['full_text'] = nb_train['full_text'].astype(str)
nb_train = nb_train[nb_train['full_text']!='[]']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [354]:
nb_val['full_text'] = nb_val['full_text'].apply(lambda x: remove_rt(x))
nb_val['full_text'] = nb_val['full_text'].apply(lambda x: re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', x))
nb_val['full_text'] = nb_val['full_text'].apply(lambda x: re.sub(r'[^\w\s]','',x))
nb_val['full_text'] = nb_val['full_text'].apply(lambda x: x.lower())
nb_val['full_text'] = nb_val['full_text'].apply(lambda x: remove_stop(x))
nb_val['full_text'] = nb_val['full_text'].astype(str)
nb_val = nb_val[nb_val['full_text']!='[]']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [356]:
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(CountVectorizer, self).build_analyzer()
        return lambda doc:(stemmer.stem(w) for w in analyzer(doc))


stem_vectorizer = StemmedCountVectorizer(stemmer)

pipeline = Pipeline([('vect', stem_vectorizer), 
                     ('tfidf', TfidfTransformer()), 
                     ('mnb', MultinomialNB(fit_prior=False))])

clf = pipeline.fit(nb_train['full_text'], nb_train['target'])

In [357]:
predictions = []
i = 0

for usr in X_val['user_id']:
    
    i+=1
    clear_output()
    print(i)
    tweets_list = nb_val[nb_val.user_id == usr]['full_text']
    tweets_list = tweets_list[tweets_list.notnull()]
    if len(tweets_list[tweets_list.notnull()]) != 0:
        pred = clf.predict_proba(tweets_list)
        predictions.append(np.mean(pred, axis=0))
    else:
        predictions.append(np.array([0.2,0.2,0.2,0.2,0.2]))

5022


In [358]:
prob = pd.DataFrame(predictions, columns=['nb_0', 'nb_1', 'nb_2', 'nb_3', 'nb_4'])

In [359]:
prob['target'] = y_val

In [360]:
prob['user_id'] = X_val['user_id']

In [361]:
# prob.to_csv('data/full/predictions1.csv')

In [362]:
stack_df = rf_prob.merge(prob, on=['user_id' , 'target'])

In [363]:
stack_df = stack_df.reindex_axis(sorted(stack_df.columns), axis=1)

  """Entry point for launching an IPython kernel.


In [364]:
stack_df.head()

Unnamed: 0,nb_0,nb_1,nb_2,nb_3,nb_4,rf_0,rf_1,rf_2,rf_3,rf_4,target,user_id
0,0.2,0.2,0.2,0.2,0.2,0.572828,0.00221,0.02503,0.246156,0.153776,0,610392907
1,0.2,0.2,0.2,0.2,0.2,0.745185,0.003614,0.005721,0.193221,0.052259,0,239615243
2,0.022305,0.844103,0.065607,0.003753,0.064231,0.004882,0.943479,0.007646,0.007892,0.036101,1,705441738391838720
3,0.013702,0.883903,0.049563,0.000985,0.051848,0.04767,0.710607,0.097362,0.007896,0.136464,1,32871086
4,0.2,0.2,0.2,0.2,0.2,0.831727,0.002515,0.006933,0.098057,0.060768,0,185036273


In [365]:
# stack_df.to_csv('data/full/logreg_train.csv')

## Metamodel training

In [366]:
lr = LogisticRegressionCV(penalty='l2', solver='saga', max_iter=1000)

In [367]:
cv = cross_validate(lr, stack_df.drop(columns=['user_id', 'target']), stack_df['target'], cv=10, scoring='f1_macro')

In [309]:
cv['test_score'].mean()

0.9616951208651816

In [368]:
scores = lr.fit(stack_df.drop(columns=['user_id', 'target']),  stack_df['target']).scores_

In [369]:
scores

{0: array([[0.71641791, 0.71641791, 0.92955224, 0.96477612, 0.96656716,
         0.96776119, 0.96776119, 0.96776119, 0.96776119, 0.96776119],
        [0.71641791, 0.71641791, 0.93313433, 0.96537313, 0.97014925,
         0.9719403 , 0.97253731, 0.97373134, 0.97373134, 0.97373134],
        [0.71650718, 0.71650718, 0.93660287, 0.96650718, 0.97129187,
         0.97308612, 0.97308612, 0.97368421, 0.97368421, 0.97368421]]),
 1: array([[0.88656716, 0.88656716, 0.88656716, 0.97850746, 0.98328358,
         0.98567164, 0.98567164, 0.98626866, 0.98626866, 0.98626866],
        [0.88656716, 0.88656716, 0.88656716, 0.98208955, 0.98985075,
         0.99223881, 0.99283582, 0.99283582, 0.99283582, 0.99283582],
        [0.88636364, 0.88636364, 0.88636364, 0.98504785, 0.98803828,
         0.98923445, 0.98923445, 0.98744019, 0.98744019, 0.98744019]]),
 2: array([[0.78089552, 0.78089552, 0.9438806 , 0.97791045, 0.97910448,
         0.98029851, 0.97970149, 0.98029851, 0.98029851, 0.98029851],
        [0.780

In [380]:
lr.predict_proba(stack_df.drop(columns=['user_id', 'target']).iloc[165:166])

array([[6.30692324e-03, 6.20944145e-04, 2.85342086e-04, 9.88274540e-01,
        4.51225033e-03]])