### Read and Clean Data

In [2]:
import pandas as pd
import re
import string
import nltk
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

pd.set_option('display.max_colwidth', 100)

stopwords = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()

data = pd.read_csv("train.csv")

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    text = re.sub('http.*', 'http', text)
    tokens = re.split('\W+', text)
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
    #text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

def day_of_week(text):
    date = re.findall(r'\d+\S\d+\S\d+', text)
    month, day, year = (int(x) for x in date[0].split('/'))    
    ans = datetime.date(year, month, day)
    day_week = ans.weekday()
    return day_week

def get_time(text): 
    time = re.findall('..:+..', text)
    hour, minute = (int(x) for x in time[0].split(':')) 
    minute = round(minute, -1)
    if minute ==60:
        minute = 0
        hour = hour +1
    if hour >24:
        hour = hour % 24
    round_time = hour*100 + minute 
    return round_time


data['text_nolink'] = data['text'].apply(lambda x: re.sub('http.*', 'http', x))
data['text_len'] = data['text_nolink'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['text_nolink'].apply(lambda x: count_punct(x))
data['punct%_trans'] = (data['punct%'])**(1/2)
data['time'] = data['created'].apply(lambda x: get_time(x))
data['day_week'] = data['created'].apply(lambda x: day_of_week(x))

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['text'])

ngram_vect = CountVectorizer(ngram_range=(2,2))
X_counts = ngram_vect.fit_transform(data['text'])

#X_features = pd.concat([data['text_len'], data['punct%'],data['day_week'], data['time'], data['favoriteCount'],
#                        data['retweetCount'],pd.DataFrame(X_tfidf.toarray())], axis=1)

X_features_zero = pd.concat([data['text_len'], data['punct%'],data['day_week'], data['time'], data['favoriteCount'],
                        data['retweetCount']], axis=1)
X_features = pd.concat([data['text_len'], data['punct%'],data['day_week'], data['time'], data['favoriteCount'],
                        data['retweetCount'],pd.DataFrame(X_counts.toarray())], axis=1)

## Correlation Matrix

In [5]:
corr = X_features_zero.corr()
corr.style.background_gradient()
corr.style.background_gradient().set_precision(2)

Unnamed: 0,text_len,punct%,day_week,time,favoriteCount,retweetCount
text_len,1.0,-0.054,0.066,-0.092,0.11,0.059
punct%,-0.054,1.0,-0.014,-0.11,-0.24,-0.18
day_week,0.066,-0.014,1.0,-0.033,0.045,0.03
time,-0.092,-0.11,-0.033,1.0,0.052,0.061
favoriteCount,0.11,-0.24,0.045,0.052,1.0,0.96
retweetCount,0.059,-0.18,0.03,0.061,0.96,1.0


In [6]:
X_features = pd.concat([data['text_len'], data['punct%'],data['day_week'], data['time'], data['favoriteCount'],pd.DataFrame(X_counts.toarray())], axis=1)

### Train-test evaluation 

In [7]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_features, data['label'], test_size=0.2)

### Vectorize test

In [9]:
from sklearn.ensemble import RandomForestClassifier
n=500

rf = RandomForestClassifier(n_estimators=n, n_jobs=-1)
rf_model = rf.fit(X_train, y_train)

In [10]:
y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred, pos_label=1, average='binary')

In [11]:
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred),3)))

Precision: 0.769 / Recall: 0.974 / Accuracy: 0.83


In [12]:
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)[0:10]

[(0.09559372023513757, 4999),
 (0.07035217202107376, 'text_len'),
 (0.043529265165680384, 'time'),
 (0.03259815609686185, 9659),
 (0.031227706041024452, 10801),
 (0.014573373255956278, 'punct%'),
 (0.011716622584964372, 'favoriteCount'),
 (0.010663088924890107, 6280),
 (0.008226766797264626, 'day_week'),
 (0.005838771759109928, 6275)]

### Tuning Parameters

In [9]:
from sklearn.model_selection import RandomizedSearchCV
from time import time

In [None]:
rf = RandomForestClassifier()

param_dist = {"n_estimators": [40,None],
              "max_depth": [3, None]}
n_iter_search = 4

clf = RandomizedSearchCV(rf, param_distributions = param_dist,n_iter=n_iter_search, cv=5, n_jobs=-1)
cv_fit = clf.fit(X_features, data['label'])
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]