# Climate change tweet classification

The purpose of this excercise is to create a machine learning model that is able to classify whether or not a person believes in climate change, based on their novel tweet data.

# Importing libraries and models

In [40]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report,confusion_matrix

# Reading in the data

In [41]:
train = pd.read_csv("C:/Users/HP/Downloads/train.csv")
test = pd.read_csv("C:/Users/HP/Downloads/test.csv")

In [42]:
train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [43]:
test.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


# Cleaning the train dataset

Removing urls

In [44]:
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'
train['message'] = train['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)

Making the messages lowercase

In [45]:
train['message'] = train['message'].str.lower()

 Stripping out punctuation marks and numerals

In [46]:
import string
def remove_punctuation_numbers(message):
    punc_numbers = string.punctuation + '0123456789'
    return ''.join([l for l in message if l not in punc_numbers])

train['message'] = train['message'].apply(remove_punctuation_numbers)

In [48]:
train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,polyscimajor epa chief doesnt think carbon dio...,625221
1,1,its not like we lack evidence of anthropogenic...,126103
2,2,rt rawstory researchers say we have three year...,698562
3,1,todayinmaker wired was a pivotal year in the...,573736
4,1,rt soynoviodetodas its and a racist sexist cl...,466954


Checking the number of messages per sentiment

In [49]:
train.sentiment.value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

The number of messages per sentiment shows that the data is not balanced, therefore, resampling will be applied.

# Preprocessing

Assiging labels to each sentiment group as defined in the dataset description

In [50]:
news = train[train['sentiment']==1]
pro = train[train['sentiment']==2]
neutral = train[train['sentiment']==0]
anti = train[train['sentiment']==-1]

Resampling the data

In [51]:
from sklearn.utils import resample

pro_upsampled = resample(pro,
                          replace=True, # sample with replacement (we need to duplicate observations)
                          n_samples=len(news), # match number in minority class
                          random_state=27) # reproducible results

neutral_upsampled = resample(neutral,
                          replace=True, # sample with replacement (we need to duplicate observations)
                          n_samples=len(news), # match number in minority class
                          random_state=27) # reproducible results

anti_upsampled = resample(anti,
                          replace=True, # sample with replacement (we need to duplicate observations)
                          n_samples=len(news), # match number in minority class
                          random_state=27) # reproducible results

The sampled dataset will be used to train the model

In [52]:
sampled_data = pd.concat([pro_upsampled, neutral_upsampled, anti_upsampled, news])

In [53]:
sampled_data.head()

Unnamed: 0,sentiment,message,tweetid
4401,2,cbcquirks quirkquestions should canada go ahea...,634603
11162,2,rt thinkprogress interior scientist says the a...,793130
3062,2,rt markdistef senator malcolm roberts to breit...,990850
5669,2,rt sciam for the third year in a row the carbo...,418438
13403,2,rt antarcticreport john kerry leaves nz for mc...,29803


Checking the number of messages per sentiment group of the balanced dataset

In [54]:
sampled_data['sentiment'].value_counts()

-1    8530
 2    8530
 1    8530
 0    8530
Name: sentiment, dtype: int64

Assigning response and features variables

In [55]:
y_sampled = sampled_data['sentiment']
X_sampled = sampled_data['message']

In [56]:
len(y_sampled), len(X_sampled)

(34120, 34120)

Fitting the data to the Tfid Vectorizer

In [57]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=3.0, min_df=3)
X_vectorized = vectorizer.fit_transform(X_sampled)

In [58]:
X_vectorized.shape

(34120, 55245)

Splitting data into train and validation sets

In [59]:
X_train,X_val,y_train,y_val = train_test_split(X_vectorized,y_sampled,test_size=0.2, random_state=42, stratify=y_sampled)

In [60]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((27296, 55245), (6824, 55245), (27296,), (6824,))

# Data modelling

# Building and training the classification models

In [61]:
# Logistic regression model
logreg = LogisticRegression(multi_class='ovr', max_iter=2000)
logreg.fit(X_train, y_train)

# Generating predictions
y_logreg_pred_val = logreg.predict(X_val)

In [62]:
# KNearest Neighbors model
kn = KNeighborsClassifier()
kn.fit(X_train, y_train)

# Generating predictions
y_kn_pred_val = kn.predict(X_val)

In [63]:
# Linear SVM
lsvm = SVC(kernel="linear")
lsvm.fit(X_train, y_train)

# Generating predictions
y_lsvm_pred_val = lsvm.predict(X_val)

In [64]:
#Training the data on RBF SVM
rbfsvm = SVC(kernel="rbf", gamma=2, C=1)
rbfsvm.fit(X_train, y_train)

# Generate predictions
y_rbfsvm_pred_val = rbfsvm.predict(X_val)

In [65]:
# Decision Tree
dtc = DecisionTreeClassifier(max_depth=5)
dtc.fit(X_train, y_train)

#Get predictions
y_dtc_pred_val = dtc.predict(X_val)

In [66]:
# Random Forest Classifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

#Get predictions
y_rfc_pred_val = rfc.predict(X_val)

In [67]:
# AdaBoost
adb = AdaBoostClassifier()
adb.fit(X_train, y_train)

#Get predictions
y_adb_pred_val = adb.predict(X_val)

# Checking model accuracy

In [68]:
# f1 scores
print('Logistic Regression F1 Score  : ', f1_score(y_val, y_logreg_pred_val, average="macro"))
print('Nearest Neighbors F1 Score    : ', f1_score(y_val, y_kn_pred_val, average="macro"))
print('Linear SVM F1 Score           : ', f1_score(y_val, y_lsvm_pred_val, average="macro"))
print('RBF SVM F1 Score              : ', f1_score(y_val, y_rbfsvm_pred_val, average="macro"))
print('Decision Tree F1 Score        : ', f1_score(y_val, y_dtc_pred_val, average="macro"))
print('Random Forest F1 Score        : ', f1_score(y_val, y_rfc_pred_val, average="macro"))
print('AdaBoost F1 Score             : ', f1_score(y_val, y_adb_pred_val, average="macro"))

Logistic Regression F1 Score  :  0.9158974243101963
Nearest Neighbors F1 Score    :  0.832097831885686
Linear SVM F1 Score           :  0.9396640187252598
RBF SVM F1 Score              :  0.9597033421533123
Decision Tree F1 Score        :  0.562005204701705
Random Forest F1 Score        :  0.9527550493305146
AdaBoost F1 Score             :  0.6134550162950387


In [69]:
# Classification reports
print('Logistic Regression')
print(classification_report(y_val, y_logreg_pred_val))
print('Nearest Neighbors')
print(classification_report(y_val, y_kn_pred_val))
print('Linear SVC')
print(classification_report(y_val, y_lsvm_pred_val))
print('RBF SVC')
print(classification_report(y_val, y_rbfsvm_pred_val))
print('Decision Tree Classifier')
print(classification_report(y_val, y_dtc_pred_val))
print('Random Forest Calssifier')
print(classification_report(y_val, y_rfc_pred_val))
print('AdaBoost')
print(classification_report(y_val, y_adb_pred_val))

Logistic Regression
              precision    recall  f1-score   support

          -1       0.97      0.98      0.97      1706
           0       0.92      0.92      0.92      1706
           1       0.90      0.82      0.86      1706
           2       0.88      0.95      0.92      1706

    accuracy                           0.92      6824
   macro avg       0.92      0.92      0.92      6824
weighted avg       0.92      0.92      0.92      6824

Nearest Neighbors
              precision    recall  f1-score   support

          -1       0.90      0.96      0.93      1706
           0       0.86      0.78      0.82      1706
           1       0.73      0.74      0.74      1706
           2       0.85      0.85      0.85      1706

    accuracy                           0.83      6824
   macro avg       0.83      0.83      0.83      6824
weighted avg       0.83      0.83      0.83      6824

Linear SVC
              precision    recall  f1-score   support

          -1       0.99   

From the above results, it shows tht Decision Tree and AdaBoost classifiers did not perform well. Next, I will apply Logistics Regression, KNearest Neighbours, Linear SVM. RBF and Random Forest on the unknown data from the test dataset to generate predictions,submit to Kaggle and assess Kaggle scores. The Linear SVM model (0.71431) and the Logistic Regression model (0.71230) scored higher than the other models on Kaggle; therefore, the final submission is Linear SVM, which has the highest score.

# Fitting Linear SVM model to test dataset for Kaggle submission

In [70]:
testx = test['message']
test_vect = vectorizer.transform(testx)

In [71]:
y_lsvm_pred_test = lsvm.predict(test_vect)
test['sentiment'] = y_lsvm_pred_test
test.head()

Unnamed: 0,message,tweetid,sentiment
0,Europe will now be looking to China to make su...,169760,1
1,Combine this with the polling of staffers re c...,35326,1
2,"The scary, unimpeachable evidence that climate...",224985,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0


In [72]:
test[['tweetid','sentiment']].to_csv('submission26.csv', index=False)

In [73]:
print("Kaggle Submission: 0.71431")

Kaggle Submission: 0.71431
