In [1]:
!pip install --upgrade pip

Collecting pip
  Downloading pip-20.2.4-py2.py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 587 kB/s 
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 20.2.2
    Uninstalling pip-20.2.2:
      Successfully uninstalled pip-20.2.2
Successfully installed pip-20.2.4


# Import Libraries

In [2]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
import os
import string
import xgboost
import time
import gc
import warnings

from sklearn import metrics
from nltk import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn import preprocessing
from sklearn import svm, tree
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from wordcloud import WordCloud
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score,roc_auc_score, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MaxAbsScaler, StandardScaler

from imblearn.over_sampling import SMOTE

# Import datasets

In [3]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/climate-change-edsa2020-21/train.csv
/kaggle/input/climate-change-edsa2020-21/test.csv
/kaggle/input/climate-change-edsa2020-21/sample_submission.csv


In [4]:
train  = pd.read_csv("/kaggle/input/climate-change-edsa2020-21/train.csv")

In [5]:
train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [6]:
test  = pd.read_csv("/kaggle/input/climate-change-edsa2020-21/test.csv")

In [7]:
test.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


In [8]:
train.describe()

Unnamed: 0,sentiment,tweetid
count,15819.0,15819.0
mean,0.917504,501719.433656
std,0.836537,289045.983132
min,-1.0,6.0
25%,1.0,253207.5
50%,1.0,502291.0
75%,1.0,753769.0
max,2.0,999888.0


# Check for missing data

In [9]:
print(f'Missing values in train dataset:\n{train.isna().sum()}\n')
print(f'Missing values in test dataset:\n{test.isna().sum()}')

Missing values in train dataset:
sentiment    0
message      0
tweetid      0
dtype: int64

Missing values in test dataset:
message    0
tweetid    0
dtype: int64


# Label Analysis - Sentiment

In [10]:
train.sentiment.value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

As we can see from above, data is very unbalanced

In [11]:
train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


# Cleaning Data

In [12]:
def clean(message):
    
    message = message.lower()
    
    url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
    web = 'url-web'
    message = re.sub(url, web, message)

    message = re.sub(r'[-]',' ',message)
    message = re.sub(r'[^\w\s]','',message)
    message = re.sub('[0-9]+', '', message)
    
    nltk_stopword = nltk.corpus.stopwords.words('english')
    message = ' '.join([item for item in message.split() if item not in nltk_stopword])
    
    message = message.lower()
    lemmatizer = WordNetLemmatizer()
    message = [lemmatizer.lemmatize(token) for token in message.split(" ")]

    message = [lemmatizer.lemmatize(token, "v") for token in message]
    message = " ".join(message)
    
    return message


In [13]:
train['message_clean']=train['message'].apply(clean)
test['message_clean']=test['message'].apply(clean)

In [14]:
from sklearn.utils import resample

train_pro = train[train.sentiment==1]
train_news = train[train.sentiment==2]
train_anti = train[train.sentiment==-1]
train_neutral = train[train.sentiment==0]

train_pro_downsample = resample(train_pro, 
                                 replace=False,    
                                 n_samples=5000,  
                                 random_state=42) 

train_anti_upsample = resample(train_anti, 
                                 replace=True,    
                                 n_samples=5000,  
                                 random_state=42)

train_rebalance = pd.concat([train_pro_downsample,train_news,train_anti_upsample,train_neutral])
train_rebalance.sentiment.value_counts()



-1    5000
 1    5000
 2    3640
 0    2353
Name: sentiment, dtype: int64

#  Splitting x and y variables

In [15]:
y = train_rebalance['sentiment']
X = train_rebalance['message_clean']

# Vectorizing Text

In [16]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2, stop_words="english")
X_vectorized = vectorizer.fit_transform(X)

# Split train data into train_val set

In [17]:
#X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.05, random_state = 42)
X_train,X_val,y_train,y_val = train_test_split(X_vectorized,y,test_size=0.20,shuffle=True, stratify=y, random_state=42)

In [18]:
#List possible classifiers

models = [
               svm.SVC(probability=True),
               tree.DecisionTreeClassifier(),
               RandomForestClassifier(n_estimators = 10),
               LogisticRegression(solver = "liblinear"),
               GradientBoostingClassifier(),
               xgboost.XGBClassifier()
    ]

In [19]:
for model in models:
    t0 = time.time()
    model.fit(X_train,y_train)
    y_pred = model.predict(X_val)
    score = accuracy_score(y_val,y_pred)
    bin_clf_rep = classification_report(y_val,y_pred, zero_division=1)
    name = str(model)
    print(name[0:name.find("(")])
    print("Accuracy :", score)
    print(bin_clf_rep)
    print(confusion_matrix(y_val,y_pred))
    print("Time Taken :", time.time()-t0, "seconds")
    print("------------------------------------------------------------")

SVC
Accuracy : 0.8036886527039699
              precision    recall  f1-score   support

          -1       0.97      0.97      0.97      1000
           0       0.76      0.38      0.51       471
           1       0.70      0.82      0.76      1000
           2       0.76      0.82      0.79       728

    accuracy                           0.80      3199
   macro avg       0.80      0.75      0.76      3199
weighted avg       0.81      0.80      0.79      3199

[[969   6  19   6]
 [ 16 179 219  57]
 [  6  46 823 125]
 [  8   5 115 600]]
Time Taken : 197.12941002845764 seconds
------------------------------------------------------------
DecisionTreeClassifier
Accuracy : 0.7099093466708346
              precision    recall  f1-score   support

          -1       0.90      0.97      0.94      1000
           0       0.48      0.45      0.46       471
           1       0.64      0.58      0.60      1000
           2       0.66      0.70      0.68       728

    accuracy                

# Train model and evaluate validation set

In [20]:
svc_clf = svm.SVC(probability=True)
svc_clf.fit(X_train, y_train)
svc_pred = svc_clf.predict(X_val)

In [21]:
f1_score(y_val, svc_pred, average = 'macro')

0.7559600540848235

# Get test set ready

In [22]:
testx = test['message_clean']
test_vect = vectorizer.transform(testx)

# Make predictions on the test set and adding a sentiment column to our original test df

In [23]:
y_pred = svc_clf.predict(test_vect)

In [24]:
test['sentiment'] = y_pred

In [25]:
test.head()

Unnamed: 0,message,tweetid,message_clean,sentiment
0,Europe will now be looking to China to make su...,169760,europe look china make sure alone fight climat...,1
1,Combine this with the polling of staffers re c...,35326,combine poll staffer climate change woman righ...,1
2,"The scary, unimpeachable evidence that climate...",224985,scary unimpeachable evidence climate change al...,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,karoli morgfair osborneink dailykos putin get ...,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,rt fakewillmoore female orgasm cause global wa...,0


# Creating an output csv for submission

In [26]:
test[['tweetid', 'sentiment']].to_csv('testsubmission.csv', index = False)