# Sentiment Analysis for Existence of Global Warming

Training dataset: 
- Sentiment Analysis – Global Warming/Climate Change
-  https://www.figure-eight.com/data-for-everyone/ 
- Contributors evaluated tweets for belief in the existence of global warming or climate change. The possible answers were “Yes” if the tweet suggests global warming is occurring, “No” if the tweet suggests global warming is not occurring, and “I can’t tell” if the tweet is ambiguous or unrelated to global warming. We also provide a confidence score for the classification of each tweet.

All code is inspired by Vicky Qian: https://gist.github.com/vickyqian

In [96]:
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import os
import copy
from wordcloud import WordCloud, STOPWORDS

In [33]:
# pandas is used from this cell onward but is not imported anywhere above,
# so a fresh kernel would fail with NameError without this import.
import pandas as pd

# Load the Earth Day tweets. header=None tells pandas the file has no header
# row, so the columns get the integer labels 0 and 1.
df = pd.read_csv("earthday.csv",header=None)

In [35]:
# Give the integer-labelled columns 0 and 1 readable names.
# inplace=True mutates df itself, so no reassignment is needed.
df.rename(columns={0:"Time", 1:"Content"}, inplace=True)
# Preview the first five rows.
df.head()

Unnamed: 0,Time,Content
0,2019-04-23 04:15:15,b'#EarthDay19 remember that we\xe2\x80\x99re p...
1,2019-04-23 04:15:12,b'RT @NatGeo: \xe2\x80\x9cThe countries that a...
2,2019-04-23 04:15:12,"b""We might die, and I'm not going to lie to yo..."
3,2019-04-23 04:15:10,"b'RT @SanjayBajpai65: In the next 30 yrs, thre..."
4,2019-04-23 04:14:38,b'The amount of shade being thrown on this Ear...


In [36]:
# (rows, columns) of the tweet table.
df.shape

(33443, 2)

In [37]:
import csv

def unicode_csv_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
    """Yield each CSV row of ``unicode_csv_data`` as a list of text cells.

    The original implementation was a Python 2 recipe: it encoded every line
    to UTF-8 bytes and decoded each cell with the ``unicode`` builtin. On
    Python 3 ``csv.reader`` works on ``str`` directly, ``unicode`` does not
    exist, and feeding bytes to ``csv.reader`` raises an error, so the
    round-trip is dropped.

    Parameters:
        unicode_csv_data: iterable of lines. Lines are normally ``str``;
            ``bytes`` lines are decoded as UTF-8 for convenience.
        dialect: CSV dialect passed to ``csv.reader``.
        **kwargs: extra formatting options forwarded to ``csv.reader``.

    Yields:
        list[str]: the cells of one row.
    """
    # Decode lazily so large inputs are still streamed line by line.
    text_lines = (
        line.decode('utf-8') if isinstance(line, bytes) else line
        for line in unicode_csv_data
    )
    for row in csv.reader(text_lines, dialect=dialect, **kwargs):
        yield row

def utf_8_encoder(unicode_csv_data):
    """Lazily yield every text line of ``unicode_csv_data`` encoded as UTF-8 bytes."""
    yield from (text_line.encode('utf-8') for text_line in unicode_csv_data)

In [38]:
# Read the training CSV, decoding it with the 'unicode_escape' codec.
# NOTE(review): in the visible cells `train` is only used to inspect its shape;
# the same file is read again later as `gbtrain` with encoding "ISO-8859-1" —
# confirm which encoding is the intended one.
train = pd.read_csv("global_warming_train.csv", encoding = 'unicode_escape')

In [39]:
# (rows, columns) of the table read from global_warming_train.csv.
train.shape

(6089, 2)

In [40]:
###Preprocess tweets
def processTweet2(tweet):
    """Normalise one raw tweet string for feature extraction.

    Steps, in order: lower-case the text, replace links with the token
    ``URL``, replace ``@mentions`` with ``AT_USER``, collapse whitespace runs
    to a single space, drop the ``#`` from hashtags, and strip leading and
    trailing quote characters.

    Parameters:
        tweet (str): the raw tweet text.

    Returns:
        str: the normalised tweet.
    """
    # All patterns are raw strings: sequences such as "\." and "\s" inside a
    # normal string literal are invalid escapes and trigger a warning on
    # current Python versions.

    # Lower-case first so the upper-case placeholder tokens inserted below
    # are not lower-cased themselves.
    tweet = tweet.lower()
    # Convert www.* or https?://* to URL
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    # Convert @username to AT_USER
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)
    # Collapse every whitespace run (including newlines and tabs) to one space
    tweet = re.sub(r'[\s]+', ' ', tweet)
    # Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # Trim surrounding single and double quotes
    tweet = tweet.strip('\'"')
    return tweet

In [41]:
###get stopword list
def getStopWordList(stopWordListFileName):
    """Read a stop-word file (one word per line) and return the words as a list.

    The placeholder tokens ``AT_USER`` and ``URL`` are always placed first so
    they are treated as stop words too. Every line is stripped of surrounding
    whitespace; blank lines therefore contribute an empty string, exactly as
    before.

    Parameters:
        stopWordListFileName (str): path of the stop-word file.

    Returns:
        list[str]: ``['AT_USER', 'URL', ...]`` followed by the file's words.

    Raises:
        OSError: if the file cannot be opened.
    """
    stopWords = ['AT_USER', 'URL']

    # The context manager closes the file even if reading fails part-way,
    # which the previous manual open/close pair did not guarantee.
    with open(stopWordListFileName, 'r') as fp:
        stopWords.extend(line.strip() for line in fp)
    return stopWords

In [42]:
# Load the stop-word list once for the whole notebook.
# getStopWordList opens and closes the file itself, so no separate file
# handle is opened here (the earlier extra open() was never used or closed).
stopWords = getStopWordList('stopword.txt')


def replaceTwoOrMore(s):
    """Shorten every run of a repeated character in ``s`` to exactly two.

    Runs of length two are left as they are; longer runs are cut down to two
    characters. re.DOTALL makes the rule apply to newline characters as well.
    """
    return re.sub(r"(.)\1{1,}", r"\1\1", s, flags=re.DOTALL)

In [43]:
def getFeatureVector(tweet):
    """Turn a pre-processed tweet into its list of lower-cased feature words.

    Each whitespace-separated token has character runs shortened by
    replaceTwoOrMore and surrounding ``'"?,.`` characters stripped. A token is
    kept only if it is not in the module-level ``stopWords`` list and consists
    solely of ASCII letters and digits, starting with a letter.
    """
    # A word must start with a letter and contain only letters and digits.
    word_shape = re.compile(r"^[a-zA-Z][a-zA-Z0-9]*$")

    featureVector = []
    for raw_token in tweet.split():
        # Shorten character runs first, then strip punctuation from both ends.
        token = replaceTwoOrMore(raw_token).strip('\'"?,.')
        is_stop_word = token in stopWords
        if is_stop_word or word_shape.search(token) is None:
            continue
        featureVector.append(token.lower())
    return featureVector

In [44]:
import re
# The compiled pattern is not bound to a name; it is only echoed as this cell's output.
re.compile('<title>(.*)</title>')

re.compile(r'<title>(.*)</title>', re.UNICODE)

In [45]:
### Load the global-warming sentiment training data

gbtrain = pd.read_csv("global_warming_train.csv", encoding ="ISO-8859-1")

# tweets      -> one (feature words, label) pair per training row
# featureList -> every feature word seen, duplicates included at this point
tweets = []
featureList = []
for sentiment, tweet in zip(gbtrain['existence'], gbtrain['tweet']):
    # Clean the raw text, then reduce it to its feature words.
    featureVector = getFeatureVector(processTweet2(tweet))
    featureList.extend(featureVector)
    tweets.append((featureVector, sentiment))

In [46]:
def extract_features(tweet):
    """Map a tweet's feature words to a presence dictionary.

    For every word in the module-level ``featureList`` the result holds the
    key ``'contains(<word>)'`` with True when the word occurs in ``tweet`` and
    False otherwise.
    """
    # A set makes each membership test constant-time.
    present = set(tweet)
    return {'contains(%s)' % word: (word in present) for word in featureList}

In [47]:
### Remove featureList duplicates
# sorted() fixes the order: iterating a set of strings gives an order that can
# change between kernel sessions (string hashing is randomised per process),
# which would make the feature order differ from run to run.
featureList = sorted(set(featureList))

In [48]:
import nltk

# Pair every labelled tweet with its feature dictionary.
training_set = nltk.classify.util.apply_features(extract_features, tweets)

# Fit the Naive Bayes classifier on the labelled features.
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)


def _predict_existence(raw_tweet):
    """Clean one raw tweet, extract its features and return the predicted label."""
    feature_words = getFeatureVector(processTweet2(raw_tweet))
    return NBClassifier.classify(extract_features(feature_words))


# df is the dataframe holding all tweets to score; keep each prediction next to its text.
df['sentiment'] = df['Content'].apply(_predict_existence)

In [49]:
# Show a bounded preview instead of echoing the whole (tens of thousands of rows) frame.
df.head(10)

Unnamed: 0,Time,Content,sentiment
0,2019-04-23 04:15:15,b'#EarthDay19 remember that we\xe2\x80\x99re p...,Y
1,2019-04-23 04:15:12,b'RT @NatGeo: \xe2\x80\x9cThe countries that a...,Y
2,2019-04-23 04:15:12,"b""We might die, and I'm not going to lie to yo...",N
3,2019-04-23 04:15:10,"b'RT @SanjayBajpai65: In the next 30 yrs, thre...",Y
4,2019-04-23 04:14:38,b'The amount of shade being thrown on this Ear...,Y
5,2019-04-23 04:14:31,b'RT @SteveSGoddard: @JonathanFranki3 @luvkit ...,N
6,2019-04-23 04:14:19,b'@ChandlerLovric @40Sauce @realCHVSE @lildick...,Y
7,2019-04-23 04:14:18,b'RT @SteveSGoddard: @JonathanFranki3 @luvkit ...,N
8,2019-04-23 04:14:05,b'More damned blush violets for coral reptiles...,N
9,2019-04-23 04:13:52,b'@charliekirk11 @realDonaldTrump Beautiful. G...,N


In [50]:
# Persist the tweets together with their predicted sentiment.
# NOTE(review): in file order this save runs before the later cells that merge
# "Yes" into "Y" and drop rows with a missing sentiment, so the written file
# presumably still contains those rows — confirm this is intended.
df.to_csv("earthday_prediction.csv")

In [54]:
# Rows predicted as "Y"; the first entry of the shape tuple is the row count.
df.loc[df["sentiment"].eq("Y")].shape

(22845, 3)

In [55]:
# Rows predicted as "N"; the first entry of the shape tuple is the row count.
df.loc[df["sentiment"].eq("N")].shape

(8308, 3)

In [56]:
# Rows predicted with the alternative spelling "Yes" of the positive label.
df.loc[df["sentiment"].eq("Yes")].shape

(65, 3)

In [63]:
# Number of tweets whose predicted sentiment is missing.
df["sentiment"].isnull().sum()

2225

In [64]:
# Fold the "Yes" spelling into "Y" so the positive class has a single label.
df["sentiment"] = df["sentiment"].replace(to_replace="Yes", value="Y")

In [68]:
# Count missing values per column before incomplete rows are dropped.
df.isna().sum()

Time            0
Content         0
sentiment    2225
dtype: int64

In [70]:
# Drop, in place, every row that has a missing value in any column.
# NOTE(review): this mutates df itself, so re-running earlier cells that
# display df will no longer reproduce their saved output.
df.dropna(inplace=True)

In [71]:
# (rows, columns) after the incomplete rows were removed.
df.shape

(31218, 3)

In [100]:
# Preview the first five rows.
# NOTE(review): the saved output shows a "Date" column, which is only created
# by the split cell that follows in file order — the cells were presumably
# executed out of order; confirm with a fresh Restart & Run All.
df.head()

Unnamed: 0,Time,Content,sentiment,Date
0,04:15:15,b'#EarthDay19 remember that we\xe2\x80\x99re p...,Y,2019-04-23
1,04:15:12,b'RT @NatGeo: \xe2\x80\x9cThe countries that a...,Y,2019-04-23
2,04:15:12,"b""We might die, and I'm not going to lie to yo...",N,2019-04-23
3,04:15:10,"b'RT @SanjayBajpai65: In the next 30 yrs, thre...",Y,2019-04-23
4,04:14:38,b'The amount of shade being thrown on this Ear...,Y,2019-04-23


In [83]:
# Split the combined "date time" stamp held in Time into separate Date and Time columns.
# Tuple-unpacking the `.str` accessor relied on iterating the accessor, which
# current pandas versions no longer support; expand=True returns the pieces as
# DataFrame columns instead. n=1 splits only at the first space, so there are
# at most two pieces.
# Not idempotent: after one run Time holds only the time part, so a second run
# finds nothing to split (the original line had the same limitation).
date_and_time = df["Time"].str.split(' ', n=1, expand=True)
df["Date"], df["Time"] = date_and_time[0], date_and_time[1]

In [91]:
# Keep only the tweets whose Date is 22 April 2019.
earthday = df.loc[df["Date"].eq("2019-04-22")]

In [92]:
# Renumber the rows from 0 and discard the old index labels (drop=True).
# NOTE(review): earthday comes from a boolean filter of df; pandas may emit a
# SettingWithCopyWarning for in-place changes on such a frame — verify.
earthday.reset_index(inplace=True, drop = True)

In [97]:
# Work on an independent deep copy so later changes cannot touch `earthday`.
earthday_final = earthday.copy(deep=True)

In [98]:
# Keep only the time of day and the predicted sentiment.
earthday_final = earthday_final.loc[:, ["Time", "sentiment"]]

In [101]:
# Write the Earth Day time/sentiment table to disk. With to_csv's default
# index=True the row index is written as the first, unnamed column.
earthday_final.to_csv("twitter_earthday.csv")