### Read and Clean Data

In [6]:
import pandas as pd
import numpy as np
import re
import string
import nltk
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

pd.set_option('display.max_colwidth', 100)

stopwords = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()

data = pd.read_csv("train.csv")

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    text = re.sub('http.*', 'http', text)
    tokens = re.split('\W+', text)
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
    #text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

def day_of_week(text):
    date = re.findall(r'\d+\S\d+\S\d+', text)
    month, day, year = (int(x) for x in date[0].split('/'))    
    ans = datetime.date(year, month, day)
    day_week = ans.weekday()
    return day_week

def get_time(text): 
    time = re.findall('..:+..', text)
    hour, minute = (int(x) for x in time[0].split(':')) 
    minute = round(minute, -1)
    if minute ==60:
        minute = 0
        hour = hour +1
    if hour >24:
        hour = hour % 24
    round_time = hour*100 + minute 
    return round_time


data['text_nolink'] = data['text'].apply(lambda x: re.sub('http.*', 'http', x))
data['text_len'] = data['text_nolink'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['text_nolink'].apply(lambda x: count_punct(x))
data['punct%_trans'] = (data['punct%'])**(1/2)
data['time'] = data['created'].apply(lambda x: get_time(x))
data['day_week'] = data['created'].apply(lambda x: day_of_week(x))

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['text'])

ngram_vect = CountVectorizer(ngram_range=(2,2))
X_counts = ngram_vect.fit_transform(data['text'])

#X_features = pd.concat([data['text_len'], data['punct%'],data['day_week'], data['time'], data['favoriteCount'],
#                        data['retweetCount'],pd.DataFrame(X_tfidf.toarray())], axis=1)

X_features_zero = pd.concat([data['text_len'], data['punct%'],data['day_week'], data['time'], data['favoriteCount'],
                        data['retweetCount']], axis=1)
X_features = pd.concat([data['text_len'], data['punct%'],data['day_week'], data['time'], data['favoriteCount'],
                        data['retweetCount'],pd.DataFrame(X_counts.toarray())], axis=1)

## Correlation Matrix

In [3]:
corr = X_features_zero.corr()
corr.style.background_gradient()
corr.style.background_gradient().set_precision(2)

Unnamed: 0,text_len,punct%,day_week,time,favoriteCount,retweetCount
text_len,1.0,-0.054,0.066,-0.092,0.11,0.059
punct%,-0.054,1.0,-0.014,-0.11,-0.24,-0.18
day_week,0.066,-0.014,1.0,-0.033,0.045,0.03
time,-0.092,-0.11,-0.033,1.0,0.052,0.061
favoriteCount,0.11,-0.24,0.045,0.052,1.0,0.96
retweetCount,0.059,-0.18,0.03,0.061,0.96,1.0


We only keep one of the features between "favoriteCount" and "retweetCount" since they are highly correlated.

In [7]:
X_features = pd.concat([data['text_len'], data['punct%'],data['day_week'], data['time'], data['favoriteCount'],pd.DataFrame(X_counts.toarray())], axis=1)
X_features.head()

Unnamed: 0,text_len,punct%,day_week,time,favoriteCount,0,1,2,3,4,...,12261,12262,12263,12264,12265,12266,12267,12268,12269,12270
0,103,2.9,1,100,14207,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,28,7.1,0,2220,9666,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,31,9.7,0,2140,25531,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,116,4.3,0,1950,28850,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,70,8.6,0,1200,12567,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Train-test evaluation 

In [8]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_features, data['label'], test_size=0.2)

### Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier
n=500

rf = RandomForestClassifier(n_estimators=n, n_jobs=-1)
rf_model = rf.fit(X_train, y_train)

In [24]:
y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred, pos_label=1, average='binary')

In [25]:
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred),3)))

Precision: 0.842 / Recall: 0.936 / Accuracy: 0.862


In [9]:
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)[0:10]

[(0.09823599006386, 4999),
 (0.06721652627127792, 'text_len'),
 (0.03585222358943195, 'time'),
 (0.029045222070738647, 9659),
 (0.02804277717514518, 10801),
 (0.014777002178556849, 'punct%'),
 (0.010893290626243462, 'favoriteCount'),
 (0.009922738767070546, 6280),
 (0.008241286503650014, 'day_week'),
 (0.006874305039625653, 5689)]

### Tuning Parameters

In [20]:
from sklearn.model_selection import RandomizedSearchCV

In [52]:
import numpy as np
rf = RandomForestClassifier()

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num =10)]
# Number of features to consider at every split 
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(3,30, num = 3)]

random_grid = { 'n_estimators': n_estimators, 
                 'max_features': max_features,
                'max_depth': max_depth}

n_iter_search = 20

clf = RandomizedSearchCV(rf, param_distributions = random_grid,n_iter=n_iter_search, cv=5, n_jobs=-1)
cv_fit = clf.fit(X_train, y_train)
#pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

In [53]:
clf.best_params_

{'n_estimators': 1600, 'max_features': 'sqrt', 'max_depth': 30}

## Boosting

In [4]:
import xgboost as xgb

In [10]:
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test, y_test)

In [16]:
gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(X_train, y_train)
y_pred= gbm.predict(X_test)

  if diff:


In [18]:
precision, recall, fscore, support = score(y_test, y_pred, pos_label=1, average='binary')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred),3)))

Precision: 0.849 / Recall: 0.877 / Accuracy: 0.844


### tuning parameters

In [23]:
# Learning Rate to avoid overfitting
learning_rate = [x for x in np.linspace(0.01,0.1, num = 10)]
# Number of trees
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num =20)]
#Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(3,10, num = 7)]
#
subsample = [x for x in np.linspace(0.8,1, num = 5)]
#
colsample_bytree = [x for x in np.linspace(0.3,0.8, num = 5)]
#
gamma = [0,1,5]

random_grid = {  'learning_rate': learning_rate ,'n_estimators': n_estimators, 
                'max_depth': max_depth, 'subsample': subsample,
                  'colsample_bytree': colsample_bytree, 'gamma': gamma}

n_iter_search = 10
bst = xgb.XGBClassifier()
clf = RandomizedSearchCV(bst, param_distributions = random_grid,n_iter=n_iter_search, cv=5, n_jobs=1).fit(X_train, y_train)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


In [24]:
clf.best_params_

{'subsample': 0.8,
 'n_estimators': 578,
 'max_depth': 4,
 'learning_rate': 0.08,
 'gamma': 1,
 'colsample_bytree': 0.425}

# Model Evaluation