# Classification Predict

In [1360]:
# Imports
from IPython.display import Image
from IPython.display import IFrame
from IPython import display
import pandas as pd
import numpy as np

import nltk
import sklearn
import imblearn
import csv
%matplotlib inline

from nltk import TreebankWordTokenizer, SnowballStemmer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import  word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from imblearn.metrics import classification_report_imbalanced

from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.utils import resample

import string
import urllib
import math
import re

nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/OmegaSel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/OmegaSel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [1361]:
#Read data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [1362]:
train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [1363]:
test.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


In [1364]:
train['sentiment'].unique()

array([ 1,  2,  0, -1])

In [1365]:
train['sentiment'].value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

In [1366]:
# Separate classes

class_one = train[train['sentiment'] == 1]
class_two = train[train['sentiment'] == 2]
class_zero = train[train['sentiment'] == 0]
class_neg_one = train[train['sentiment'] == -1]

class_size = 6000

In [1367]:
# Downsample
class_one_downsampled = resample(class_one,
                          replace=False, # sample without replacement (no need to duplicate observations)
                          n_samples=class_size, # match number in minority class
                          random_state=42) # reproducible results

# Upsample
class_two_upsampled = resample(class_two,
                          replace=True, # sample with replacement (we need to duplicate observations)
                          n_samples=class_size, # match number in minority class
                          random_state=42) # reproducible results

class_zero_upsampled = resample(class_zero,
                          replace=True, # sample with replacement (we need to duplicate observations)
                          n_samples=class_size, # match number in minority class
                          random_state=42) # reproducible results

class_neg_one_upsampled = resample(class_neg_one,
                          replace=True, # sample with replacement (we need to duplicate observations)
                          n_samples=class_size, # match number in minority class
                          random_state=42) # reproducible results

train_resampled = pd.concat([class_one_downsampled, class_two_upsampled, class_zero_upsampled, class_neg_one_upsampled])

In [1368]:
train = train_resampled

In [1369]:
# Remove urls
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = ''
train['message'] = train['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)

# Remove handles
pattern = r'@[^\s]+'
subs = ''
train['message'] = train['message'].replace(to_replace = pattern, value = subs, regex = True)

# Remove punctuation and numbers
import string
def remove_punctuation_numbers(post):
    punc_numbers = string.punctuation + '0123456789'
    return ''.join([l for l in post if l not in punc_numbers])
train['message'] = train['message'].apply(remove_punctuation_numbers)

# Make lower case
train['message'] = train['message'].str.lower()

In [1370]:
train.head()

Unnamed: 0,sentiment,message,tweetid
4722,1,rt watch beforetheflood right here as travel...,555359
14904,1,rt we have a presidentelect who doesnt believ...,504402
12635,1,rt fed court has ruled rights of youth threa...,559217
11307,1,rt the us elected trump but the rest of the w...,936602
12348,1,rt we have a presidentelect who doesnã¢â‚¬â„¢...,639182


In [1371]:
# Tokenize
tokeniser = TweetTokenizer()
train['tokens'] = train['message'].apply(tokeniser.tokenize)

In [1372]:
# Stem or Lemmatize
lemmatizer = WordNetLemmatizer()

def data_lemma(words, lemmatizer):
    return [lemmatizer.lemmatize(word) for word in words]
train['lemma'] = train['tokens'].apply(data_lemma, args=(lemmatizer, ))

In [1373]:
train.head()

Unnamed: 0,sentiment,message,tweetid,tokens,lemma
4722,1,rt watch beforetheflood right here as travel...,555359,"[rt, watch, beforetheflood, right, here, as, t...","[rt, watch, beforetheflood, right, here, a, tr..."
14904,1,rt we have a presidentelect who doesnt believ...,504402,"[rt, we, have, a, presidentelect, who, doesnt,...","[rt, we, have, a, presidentelect, who, doesnt,..."
12635,1,rt fed court has ruled rights of youth threa...,559217,"[rt, fed, court, has, ruled, rights, of, youth...","[rt, fed, court, ha, ruled, right, of, youth, ..."
11307,1,rt the us elected trump but the rest of the w...,936602,"[rt, the, us, elected, trump, but, the, rest, ...","[rt, the, u, elected, trump, but, the, rest, o..."
12348,1,rt we have a presidentelect who doesnã¢â‚¬â„¢...,639182,"[rt, we, have, a, presidentelect, who, doesnã,...","[rt, we, have, a, presidentelect, who, doesnã,..."


In [1374]:
# Vectorization
tt = TfidfVectorizer(preprocessor=list, tokenizer=list, ngram_range=(1,3), min_df=2, strip_accents='ascii', smooth_idf=False)
train_vec = tt.fit_transform(train['lemma'])

In [1375]:
y = train['sentiment']

In [1376]:
X_train, X_test, y_train, y_test = train_test_split(train_vec, y, test_size=0.20, random_state=42)

In [1377]:
r_forest = RandomForestClassifier(n_estimators = 100, random_state = 42)
r_forest.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [1378]:
y_pred = r_forest.predict(X_test)

In [1379]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.99      0.98      0.98      1212
           0       0.94      0.93      0.94      1197
           1       0.86      0.91      0.88      1210
           2       0.93      0.90      0.92      1181

    accuracy                           0.93      4800
   macro avg       0.93      0.93      0.93      4800
weighted avg       0.93      0.93      0.93      4800



In [1380]:
test.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


In [1381]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10546 entries, 0 to 10545
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   message  10546 non-null  object
 1   tweetid  10546 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 164.9+ KB


In [1382]:
# Remove urls
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = ''
test['message'] = test['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)

# Remove handles
pattern = r'@[^\s]+'
subs = ''
test['message'] = test['message'].replace(to_replace = pattern, value = subs, regex = True)

# Remove punctuation and numbers
import string
def remove_punctuation_numbers(post):
    punc_numbers = string.punctuation + '0123456789'
    return ''.join([l for l in post if l not in punc_numbers])
test['message'] = test['message'].apply(remove_punctuation_numbers)

# Make lower case
test['message'] = test['message'].str.lower()

In [1383]:
# Tokenize
tokeniser = TreebankWordTokenizer()
test['tokens'] = test['message'].apply(tokeniser.tokenize)

In [1384]:
# Stem or Lemmatize
lemmatizer = WordNetLemmatizer()

def data_lemma(words, lemmatizer):
    return [lemmatizer.lemmatize(word) for word in words]
test['lemma'] = test['tokens'].apply(data_lemma, args=(lemmatizer, ))

In [1385]:
test.head()

Unnamed: 0,message,tweetid,tokens,lemma
0,europe will now be looking to china to make su...,169760,"[europe, will, now, be, looking, to, china, to...","[europe, will, now, be, looking, to, china, to..."
1,combine this with the polling of staffers re c...,35326,"[combine, this, with, the, polling, of, staffe...","[combine, this, with, the, polling, of, staffe..."
2,the scary unimpeachable evidence that climate ...,224985,"[the, scary, unimpeachable, evidence, that, cl...","[the, scary, unimpeachable, evidence, that, cl..."
3,\nputin got to you too jill \ntrump doesn...,476263,"[putin, got, to, you, too, jill, trump, doesnt...","[putin, got, to, you, too, jill, trump, doesnt..."
4,rt female orgasms cause global warming\nsarca...,872928,"[rt, female, orgasms, cause, global, warming, ...","[rt, female, orgasm, cause, global, warming, s..."


In [1386]:
# Vectorization

test_vec = tt.transform(test['lemma'])

In [1387]:
predictions = r_forest.predict(test_vec)

In [1388]:
predictions

array([1, 1, 1, ..., 1, 0, 1])

In [1389]:
df_predictions = pd.DataFrame(predictions)

In [1390]:
df_predictions['tweetid'] = test['tweetid']

In [1391]:
df_predictions.head()

Unnamed: 0,0,tweetid
0,1,169760
1,1,35326
2,1,224985
3,1,476263
4,0,872928


In [1392]:
df_predictions.columns = ['sentiment', 'tweetid']
df_predictions = df_predictions[["tweetid", "sentiment"]]

In [1393]:
df_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10546 entries, 0 to 10545
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   tweetid    10546 non-null  int64
 1   sentiment  10546 non-null  int64
dtypes: int64(2)
memory usage: 164.9 KB


In [1394]:
file_name = "selby_submission_00.csv"
df_predictions.to_csv(file_name)
df_predictions.to_csv(file_name, index = False)