### Training

In [36]:
import datetime
import re
import string

import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Allow up to 100 characters per cell when frames are displayed.
pd.options.display.max_colwidth = 100

# NLTK resources used by clean_text(): the English stop-word list and a
# WordNet lemmatizer.
stopwords = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()

# Both CSV files are read from the current working directory.
data, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

def clean_text(text):
    """Turn a raw string into a list of lower-cased, lemmatized tokens.

    This function is passed to ``TfidfVectorizer`` as its ``analyzer``.

    Steps, in order:
      1. Remove every character contained in ``string.punctuation`` and
         lower-case the remaining characters.
      2. On each line, replace everything from the first ``http`` to the end
         of the line with the single token ``http``.
      3. Split on runs of non-word characters.
      4. Drop tokens found in the module-level ``stopwords`` list and
         lemmatize the rest with the module-level ``wn`` lemmatizer.

    Args:
        text: The raw string to tokenize.

    Returns:
        list: The lemmatized tokens. Empty strings produced by ``re.split``
        at the edges of the text are not filtered out.
    """
    # NOTE(review): punctuation is stripped *before* the link is collapsed, so
    # "http://host/x" has already become "httphostx" by the time the
    # substitution runs, and any words after the link on the same line are
    # swallowed by ``.*`` as well. Confirm that this is intended.
    text = "".join(char.lower() for char in text if char not in string.punctuation)
    text = re.sub('http.*', 'http', text)
    # Raw string: '\W' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    tokens = re.split(r'\W+', text)
    return [wn.lemmatize(word) for word in tokens if word not in stopwords]

def count_punct(text):
    """Return the share of punctuation among the non-space characters of ``text``.

    Args:
        text: The string to inspect.

    Returns:
        float: ``100 * punctuation_count / non_space_length``, where the ratio
        is rounded to three decimals before scaling. Returns ``0.0`` when
        ``text`` has no non-space characters (empty or all spaces) instead of
        raising ``ZeroDivisionError``.
    """
    non_space_len = len(text) - text.count(" ")
    if non_space_len == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    count = sum(1 for char in text if char in string.punctuation)
    return round(count / non_space_len, 3) * 100

def day_of_week(text):
    """Return the weekday number of the first date found in ``text``.

    The first substring made of three digit groups joined by single non-space
    characters is taken and split on ``/`` into month, day and year, in that
    order.

    Args:
        text: A string containing a date such as ``11/17/2015``.

    Returns:
        int: ``datetime.date.weekday()`` of that date (Monday is 0, Sunday is 6).

    Raises:
        IndexError: If no date-like substring is found.
        ValueError: If the substring does not split into three integers on
            ``/`` or does not form a valid calendar date.
    """
    first_match = re.findall(r'\d+\S\d+\S\d+', text)[0]
    month, day, year = [int(piece) for piece in first_match.split('/')]
    return datetime.date(year, month, day).weekday()

def get_time(text):
    """Return the clock time found in ``text`` as an integer ``hour * 100 + minute``.

    The first substring matching "two characters, one or more colons, two
    characters" is taken and split on ``:`` into hour and minute. The minute
    is rounded to a multiple of ten; a minute that rounds up to 60 carries
    into the hour, and the hour wraps around at midnight.

    Args:
        text: A string containing a time such as ``23:55``.

    Returns:
        int: The rounded time, e.g. ``1030`` for ``10:32`` and ``0`` for
        ``23:55``.

    Raises:
        IndexError: If no time-like substring is found.
        ValueError: If the substring does not split into exactly two integers.
    """
    time = re.findall('..:+..', text)
    hour, minute = (int(x) for x in time[0].split(':'))
    # round() on an int with ndigits=-1 rounds ties to the even multiple of
    # ten, so 5 -> 0, 15 -> 20, 25 -> 20, 55 -> 60.
    minute = round(minute, -1)
    if minute == 60:
        minute = 0
        hour += 1
    # Wrap at 24, not only above it: 23:55 rounds up to hour 24, which must
    # become 0 rather than produce the out-of-range value 2400.
    hour %= 24
    return hour * 100 + minute


def add_engineered_features(frame):
    """Add the hand-crafted feature columns to ``frame`` in place and return it.

    Reads the ``text`` and ``created`` columns and writes:
      - ``text_nolink``: ``text`` with everything from ``http`` to the end of
        the line replaced by ``http``.
      - ``text_len``: number of non-space characters in ``text_nolink``.
      - ``punct%``: ``count_punct`` applied to ``text_nolink``.
      - ``punct%_trans``: square root of ``punct%``.
      - ``time``: ``get_time`` applied to ``created``.
      - ``day_week``: ``day_of_week`` applied to ``created``.

    Args:
        frame: A DataFrame with ``text`` and ``created`` columns.

    Returns:
        The same DataFrame object, with the columns above added.
    """
    frame['text_nolink'] = frame['text'].apply(lambda x: re.sub('http.*', 'http', x))
    frame['text_len'] = frame['text_nolink'].apply(lambda x: len(x) - x.count(" "))
    frame['punct%'] = frame['text_nolink'].apply(count_punct)
    frame['punct%_trans'] = frame['punct%'] ** (1/2)
    frame['time'] = frame['created'].apply(get_time)
    frame['day_week'] = frame['created'].apply(day_of_week)
    return frame


# One shared implementation keeps the training and test features in step.
data = add_engineered_features(data)
test = add_engineered_features(test)


In [37]:
# Fit the TF-IDF vocabulary on the training text only, tokenizing with clean_text.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(data['text'])

# Project both frames onto the vocabulary learned above.
X_tfidf_train = tfidf_vect_fit.transform(data['text'])
X_tfidf_test = tfidf_vect_fit.transform(test['text'])

# Hand-crafted columns that are placed in front of the TF-IDF columns.
meta_columns = ['text_len', 'punct%', 'day_week', 'time', 'favoriteCount']


def _with_tfidf(frame, tfidf_matrix):
    """Return the meta columns of ``frame`` side by side with a dense copy of ``tfidf_matrix``."""
    meta = frame[meta_columns].reset_index(drop=True)
    dense_tfidf = pd.DataFrame(tfidf_matrix.toarray())
    return pd.concat([meta, dense_tfidf], axis=1)


X_features = _with_tfidf(data, X_tfidf_train)
X_test = _with_tfidf(test, X_tfidf_test)
y_train = data['label']

In [38]:
# One-hot encode the day_week column of the training matrix.
features = pd.get_dummies(X_features, columns = ["day_week"])

# A weekday that never occurs in one of the frames produces no dummy column
# there, which would leave the two matrices with different columns. Align the
# test matrix to the training columns: dummies missing from the test set are
# filled with 0 and columns unknown to the training set are dropped.
# NOTE: X_test is overwritten here, so this cell cannot be run twice without
# first re-running the cell that builds X_test.
X_test = pd.get_dummies(X_test, columns = ["day_week"]).reindex(columns=features.columns, fill_value=0)

# Last expression of the cell so that the preview is actually displayed.
features.head(5)

In [39]:
from sklearn.ensemble import RandomForestClassifier

# Forest size actually used for training. The variable previously held 100
# while the classifier was built with a literal 200; both now agree on 200.
n_estimators = 200
# Fixed seed so that retraining reproduces the same forest and predictions.
RANDOM_STATE = 42

rf = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1, random_state=RANDOM_STATE)
rf_model = rf.fit(features, y_train)

y_pred = rf_model.predict(X_test)

In [40]:
# Number of test rows whose predicted label is -1.
(y_pred == -1).sum()

107

In [25]:
# DataFrame.to_csv returns None when it is given a path, so build the frame
# first and write it in a separate step; `prediction` then holds the frame.
prediction = pd.DataFrame(y_pred, columns=['Label'])
prediction.to_csv('prediction.csv')

In [None]:
# Preview only the first rows; the frame has one column per vocabulary term.
X_features.head()

# Data Visualization

## Feature Importance

In [41]:
# Column labels of the training matrix and the forest's importance for each.
feature_list = list(features.columns)
importances = list(rf.feature_importances_)

# Pair every feature with its importance rounded to two decimals, then order
# the pairs from most to least important.
feature_importances = [
    (name, round(score, 2)) for name, score in zip(feature_list, importances)
]
feature_importances.sort(key=lambda pair: pair[1], reverse=True)

# One line per feature, highest importance first.
for name, score in feature_importances:
    print('Variable: {:20} Importance: {}'.format(name, score))

Variable:                 1300 Importance: 0.11
Variable: text_len             Importance: 0.07
Variable: time                 Importance: 0.05
Variable:                 2686 Importance: 0.05
Variable:                 2601 Importance: 0.04
Variable: punct%               Importance: 0.02
Variable: favoriteCount        Importance: 0.02
Variable:                 1646 Importance: 0.02
Variable:                 2143 Importance: 0.02
Variable:                  226 Importance: 0.01
Variable:                 1171 Importance: 0.01
Variable:                 1265 Importance: 0.01
Variable:                 2684 Importance: 0.01
Variable:                 2807 Importance: 0.01
Variable: day_week_4           Importance: 0.01
Variable:                    0 Importance: 0.0
Variable:                    1 Importance: 0.0
Variable:                    2 Importance: 0.0
Variable:                    3 Importance: 0.0
Variable:                    4 Importance: 0.0
Variable:                    5 Importance: 0.

Variable:                 2291 Importance: 0.0
Variable:                 2292 Importance: 0.0
Variable:                 2293 Importance: 0.0
Variable:                 2294 Importance: 0.0
Variable:                 2295 Importance: 0.0
Variable:                 2296 Importance: 0.0
Variable:                 2297 Importance: 0.0
Variable:                 2298 Importance: 0.0
Variable:                 2299 Importance: 0.0
Variable:                 2300 Importance: 0.0
Variable:                 2301 Importance: 0.0
Variable:                 2302 Importance: 0.0
Variable:                 2303 Importance: 0.0
Variable:                 2304 Importance: 0.0
Variable:                 2305 Importance: 0.0
Variable:                 2306 Importance: 0.0
Variable:                 2307 Importance: 0.0
Variable:                 2308 Importance: 0.0
Variable:                 2309 Importance: 0.0
Variable:                 2310 Importance: 0.0
Variable:                 2311 Importance: 0.0
Variable:    

In [42]:
# Positions within feature_list of the nine highest-ranked features, in the
# order shown by the recorded importance listing. Integer labels are TF-IDF
# column numbers; string labels are hand-crafted columns.
# NOTE: these labels are hard-coded from one training run. Re-check them
# whenever the vocabulary or the model changes.
top_feature_labels = [1300, 'text_len', 'time', 2686, 2601, 'punct%', 'favoriteCount', 1646, 2143]
important_indices = np.array([feature_list.index(label) for label in top_feature_labels])



In [43]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation offers.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    tfidf_terms = list(tfidf_vect.get_feature_names_out())
else:
    tfidf_terms = tfidf_vect.get_feature_names()

# Kept for backward compatibility with any cell that reads this attribute.
X_tfidf_train.columns = tfidf_terms

# Vocabulary terms behind the TF-IDF columns that rank highest in importance.
for term_index in (1300, 2686, 2601, 1646, 2143):
    print(tfidf_terms[term_index])

http
trump2016
thank
makeamericagreatagain
realdonaldtrump


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Set the style
plt.style.use('fivethirtyeight')
# list of x locations for plotting
x_values = list(range(10))
# Make a bar chart
plt.bar(x_values, importances, orientation = 'vertical')
# Tick labels for x axis
plt.xticks(x_values, feature_list, rotation='vertical')
# Axis labels and title
plt.ylabel('Importance'); plt.xlabel('Variable'); plt.title('Variable Importances');