# Machine Learning: Classification Predict Team 4

## Problem Statement:

### Global Warming Sentiment Analysis using Twitter Data

## Import libraries and datasets

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

### Train dataset

In [3]:
train_df = pd.read_csv('data/train.csv')
train_df.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


### Test dataset

In [4]:
test_df = pd.read_csv('data/test.csv')
test_df.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


### Sample solution

In [5]:
ss = pd.read_csv('data/sample_submission.csv')
ss.head()

Unnamed: 0,tweetid,sentiment
0,169760,1
1,35326,1
2,224985,1
3,476263,1
4,872928,1


# 1. Data Cleaning

In [6]:
df = train_df.copy()
df.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [12]:
def clean_text(df):
    data = df.copy()
    # Removing mentions and hashtags
    data.message = data.message.apply(lambda x: re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)","",x))
    # Removing https/http links
    data.message = data.message.apply(lambda x: re.sub('http[s]?://\S+', '', x))
    # Replacing non words or punctuation with spaces
    data.message = data.message.apply(lambda x: re.sub('\W',' ',x))
    # Removing numbers
    data.message = data.message.apply(lambda x: re.sub('\d+','',x.lower()))
    return data

In [13]:
df = clean_text(df)
df.head()

Unnamed: 0,sentiment,message,tweetid
0,1,polyscimajor epa chief doesnt think carbon dio...,625221
1,1,its not like we lack evidence of anthropogenic...,126103
2,2,rt researchers say we have three years to act...,698562
3,1,todayinmaker wired was a pivotal year in the...,573736
4,1,rt its and a racist sexist climate change de...,466954


In [14]:
test = clean_text(test_df)
test.head()

Unnamed: 0,message,tweetid
0,europe will now be looking to china to make su...,169760
1,combine this with the polling of staffers re c...,35326
2,the scary unimpeachable evidence that climate ...,224985
3,putin got to you too jill trump doesnt be...,476263
4,rt female orgasms cause global warmingsarcast...,872928


### 1.1 Vectorize

In [15]:
def vectorize_features(df,v):
    data = df.copy()
    messages = list(data.message)
    train_vector_x = v.fit_transform(messages)
    return train_vector_x

In [16]:
def vectorize_test(df,v):
    data = df.copy()
    messages = list(data.message)
    train_vector_x = v.transform(messages)
    return train_vector_x

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()

train_vectors_X = vectorize_features(df,v)
test_vectors_X = vectorize_test(test,v)

In [18]:
train_y = list(df.sentiment)

# 3. Model Training: Classification

### Fitting Model

In [19]:
from sklearn import svm
clf_svm = svm.SVC(kernel='linear')

In [20]:
clf_svm.fit(train_vectors_X,train_y)

SVC(kernel='linear')

In [21]:
sentiment_pred = clf_svm.predict(test_vectors_X)
sentiment_pred

array([1, 1, 1, ..., 2, 0, 1])

In [22]:
# Predicted sentiment results
test['sentiment'] = list(sentiment_pred)
test.head()

Unnamed: 0,message,tweetid,sentiment
0,europe will now be looking to china to make su...,169760,1
1,combine this with the polling of staffers re c...,35326,1
2,the scary unimpeachable evidence that climate ...,224985,1
3,putin got to you too jill trump doesnt be...,476263,1
4,rt female orgasms cause global warmingsarcast...,872928,2


In [23]:
# Sample Solution results
ss.head()

Unnamed: 0,tweetid,sentiment
0,169760,1
1,35326,1
2,224985,1
3,476263,1
4,872928,1


In [24]:
from sklearn.metrics import f1_score
f1_score(list(ss.sentiment), list(sentiment_pred), average='macro')

0.17670141017780502

In [28]:
from sklearn.linear_model import LogisticRegression
clf_log = LogisticRegression()
clf_log.fit(train_vectors_X,train_y)
y_pred = clf_log.predict(test_vectors_X)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [29]:
f1_score(list(ss.sentiment), list(y_pred), average='macro')

0.18525040291291114