### Training

In [1]:
import pandas as pd
import re
import string
import nltk
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer

# Let rendered DataFrames show up to 100 characters per cell before truncating.
pd.set_option('display.max_colwidth', 100)

# NLTK resources used by clean_text: the English stop-word list and a WordNet lemmatizer.
stopwords = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()

# Training table and the table to predict on; paths are relative to the working directory.
data = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

def clean_text(text):
    """Tokenise one raw text string (used as the TF-IDF analyzer).

    Steps, in order: drop ASCII punctuation and lower-case every character,
    replace everything from an ``http`` to the end of its line with the single
    token ``http``, split on runs of non-word characters, remove stop words and
    lemmatize what is left.

    Relies on the module-level ``stopwords`` collection and ``wn`` lemmatizer.

    Args:
        text: The raw string to clean.

    Returns:
        list: Lower-cased, lemmatized tokens with stop words and empty strings
        removed.
    """
    lowered = "".join(ch.lower() for ch in text if ch not in string.punctuation)
    # '.' does not cross newlines, so the replacement swallows the rest of the line.
    lowered = re.sub('http.*', 'http', lowered)
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    tokens = re.split(r'\W+', lowered)
    # re.split yields '' at either end when the text starts or ends with a
    # separator; skip those so '' never becomes a vocabulary entry.
    return [wn.lemmatize(tok) for tok in tokens if tok and tok not in stopwords]

def count_punct(text):
    """Return the percentage of non-space characters in ``text`` that are punctuation.

    Punctuation means membership in ``string.punctuation``; only the literal
    space character ``" "`` is excluded from the denominator.

    Args:
        text: The string to measure.

    Returns:
        float: ``round(ratio, 3) * 100``, or ``0.0`` when ``text`` has no
        non-space characters (empty or spaces only).
    """
    non_space = len(text) - text.count(" ")
    if non_space == 0:
        # Nothing to measure; avoids dividing by zero on empty / all-space text.
        return 0.0
    punct = sum(1 for ch in text if ch in string.punctuation)
    return round(punct / non_space, 3) * 100

def day_of_week(text):
    """Return the weekday index of the first ``month/day/year`` date in ``text``.

    Args:
        text: String containing a slash-separated numeric date,
            e.g. ``"10/7/2016 14:30"``.

    Returns:
        int: ``datetime.date.weekday()`` of that date (Monday is 0, Sunday is 6).

    Raises:
        ValueError: If no ``m/d/y`` date is present, or the numbers do not form
            a valid calendar date.
    """
    # Require literal slashes: a looser separator would let a colon-separated
    # time such as "14:30:05" be picked up as the date when it comes first.
    found = re.search(r'(\d+)/(\d+)/(\d+)', text)
    if found is None:
        raise ValueError("no month/day/year date found in %r" % (text,))
    month, day, year = (int(part) for part in found.groups())
    return datetime.date(year, month, day).weekday()

def get_time(text):
    """Return the first ``HH:MM`` time in ``text``, rounded to ten minutes.

    The result is encoded as the integer ``hour * 100 + minute``; for example
    ``"14:32"`` gives ``1430``.

    Args:
        text: String containing a time with two characters on each side of the
            colon, e.g. ``"10/7/2016 14:32"``.

    Returns:
        int: Rounded time in the range 0..2350. Rounding past the end of the
        day wraps to ``0``.

    Raises:
        IndexError: If no time-like ``..:..`` pattern is found.
        ValueError: If the matched pieces are not integers.
    """
    matches = re.findall('..:+..', text)
    hour, minute = (int(part) for part in matches[0].split(':'))
    # round() on ints with ndigits=-1 rounds halves to the even ten (25 -> 20, 35 -> 40).
    minute = round(minute, -1)
    if minute == 60:
        # Rounded up into the next hour.
        minute = 0
        hour += 1
    if hour >= 24:
        # >= (not >) so that 23:55 -> 24:00 wraps to 0 instead of yielding 2400.
        hour %= 24
    return hour * 100 + minute


# Derive the same hand-crafted columns on the training table, then on the test table.
for frame in (data, test):
    # Everything from "http" to the end of the line becomes the bare token "http".
    frame['text_nolink'] = frame['text'].apply(lambda raw: re.sub('http.*', 'http', raw))
    # Character count ignoring spaces.
    frame['text_len'] = frame['text_nolink'].apply(lambda s: len(s) - s.count(" "))
    # Punctuation share and its square root.
    frame['punct%'] = frame['text_nolink'].apply(count_punct)
    frame['punct%_trans'] = frame['punct%'] ** (1/2)
    # Rounded time of day and weekday index parsed from the 'created' string.
    frame['time'] = frame['created'].apply(get_time)
    frame['day_week'] = frame['created'].apply(day_of_week)


FileNotFoundError: File b'test.csv' does not exist

In [None]:
# Learn the TF-IDF vocabulary on the training text only, then encode both tables with it.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(data['text'])

X_tfidf_train = tfidf_vect_fit.transform(data['text'])
X_tfidf_test = tfidf_vect_fit.transform(test['text'])


def _build_feature_frame(frame, tfidf_matrix):
    """Join the hand-crafted columns of ``frame`` with its dense TF-IDF matrix, column-wise."""
    handcrafted = frame[['text_len', 'punct%', 'day_week', 'time', 'favoriteCount',
                         'retweetCount']].reset_index(drop=True)
    combined = pd.concat([handcrafted, pd.DataFrame(tfidf_matrix.toarray())], axis=1)
    # The TF-IDF columns are labelled with ints next to the string-named columns;
    # recent scikit-learn releases reject frames whose column names mix str and int,
    # so give every column a string name.
    combined.columns = combined.columns.astype(str)
    return combined


X_features = _build_feature_frame(data, X_tfidf_train)
X_test = _build_feature_frame(test, X_tfidf_test)
y_train = data['label']

In [3]:
from sklearn.ensemble import RandomForestClassifier

# Number of trees; defined once and passed in so the variable and the model cannot disagree.
n_estimators = 200

# Fixed seed so a fresh Restart & Run All reproduces the same forest and predictions.
rf = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1, random_state=42)
rf_model = rf.fit(X_features, y_train)

y_pred = rf_model.predict(X_test)

In [6]:
# Count how many test rows were predicted as class -1.
sum(y_pred==-1)

110

In [7]:
# Build the frame first, then write it: DataFrame.to_csv returns None when given a
# path, so chaining it into the assignment would bind `prediction` to None.
prediction = pd.DataFrame(y_pred, columns=['Label'])
prediction.to_csv('prediction.csv')

In [None]:
# Preview only the first rows; the frame has one column per vocabulary term.
X_features.head()