## Data Prep

In [38]:
import json
import numpy as np
import pandas as pd

import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud

from textblob import TextBlob
# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


import preprocessor as p # 
import string # use string.punctuation to clean out punctuation

from pprint import pprint

from datetime import datetime
import pickle



from tqdm import tqdm

# Show full column contents (no truncation) when displaying DataFrames.
# None is the supported "unlimited" value; the legacy -1 sentinel is
# deprecated and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

In [3]:
# Export the MongoDB collection to json format and transfer from AWS EC2 to localhost
# mongoexport -d climatechange -c climate_tweets -o tweets.json
# scp myaws:tweets.json .
# Read the data into an array

# Parse the exported file line by line: each non-blank line is decoded
# as one JSON document and collected into tweets_data.
tweets_data = []
# The context manager closes the file handle even if a line fails to parse;
# the explicit encoding avoids depending on the platform default.
with open('tweets.json', 'r', encoding='utf-8') as tweets_file:
    for line in tweets_file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # letting json.loads raise on an empty string.
            continue
        tweet = json.loads(line)
        tweets_data.append(tweet)
    

In [4]:
# Sanity check: number of tweet records loaded into tweets_data.
print(len(tweets_data))

17579


### Load Tweets into a DataFrame

In [5]:
# Start an empty frame and seed it with the author's screen name and the
# creation timestamp parsed from each record's 'created_at' string.
tweetsDF = pd.DataFrame()
screen_names = [record['user']['screen_name'] for record in tweets_data]
timestamps = [
    datetime.strptime(record['created_at'], '%a %b %d %H:%M:%S %z %Y')
    for record in tweets_data
]
tweetsDF['user'] = screen_names
tweetsDF['created_at'] = timestamps

In [6]:
# Collect one text per record: use the nested retweeted_status text when
# that lookup succeeds, otherwise fall back to the record's own full_text.
text = []

for status in tqdm(tweets_data):
    try:
        body = status['retweeted_status']['full_text']
    except KeyError:
        # Either key is missing: use the top-level text instead.
        body = status['full_text']
    text.append(body)

tweetsDF['full_text'] = text

100%|██████████| 17579/17579 [00:00<00:00, 733495.19it/s]


In [7]:
# Attach per-tweet metadata columns; every list has one entry per record
# in tweets_data, in the same order as the rows of tweetsDF.
tweetsDF['truncated'] = [tweet['truncated'] for tweet in tweets_data]
# 1 when the text contains the literal substring 'https', else 0
# (boolean * 1 -> integer). A plain substring match needs no regex engine.
tweetsDF['contains_url'] = tweetsDF['full_text'].str.contains('https', regex=False)*1
tweetsDF['favorite_count'] = [tweet['favorite_count'] for tweet in tweets_data]
# 'place' may be None; compare by identity (`is not None`) rather than `!=`.
tweetsDF['location'] = [tweet['place']['country'] if tweet['place'] is not None else None
                        for tweet in tweets_data]
tweetsDF['time_zone'] = [tweet['user']['time_zone'] for tweet in tweets_data]
# The source field 'friends_count' is stored under the column name 'following_count'.
tweetsDF['following_count'] = [tweet['user']['friends_count'] for tweet in tweets_data]
tweetsDF['followers_count'] = [tweet['user']['followers_count'] for tweet in tweets_data]
tweetsDF['retweet_count'] = [tweet['retweet_count'] for tweet in tweets_data]

In [8]:
# Inspect the dtype pandas assigned to each column of tweetsDF.
tweetsDF.dtypes

user               object             
created_at         datetime64[ns, UTC]
full_text          object             
truncated          bool               
contains_url       int64              
favorite_count     int64              
location           object             
time_zone          object             
following_count    int64              
followers_count    int64              
retweet_count      int64              
dtype: object

## Clean Tweets

In [9]:
# Remove #-signs, but keep words from hashtag.
# The result is assigned back to the column explicitly: calling
# Series.replace(..., inplace=True) on `tweetsDF.full_text` is chained
# assignment, which warns on recent pandas and leaves tweetsDF unchanged
# under Copy-on-Write.
tweetsDF['full_text'] = tweetsDF['full_text'].replace(to_replace=r'#', value=r'', regex=True)

In [10]:
# Pass every tweet text through preprocessor's clean() (imported as `p`)
# and store the cleaned strings back in the full_text column.
# NOTE(review): the frame displayed further down contains "elain" where
# "explain" would be expected - possibly clean() stripping "xp" as a
# smiley; confirm against the tweet-preprocessor options.
cleaned_tweets = [p.clean(raw_text) for raw_text in tweetsDF.full_text]

tweetsDF['full_text'] = cleaned_tweets

In [11]:
# Replace the HTML ampersand entity '&amp;' with the word 'and'.
# The result is assigned back to the column explicitly: calling
# Series.replace(..., inplace=True) on `tweetsDF.full_text` is chained
# assignment, which warns on recent pandas and leaves tweetsDF unchanged
# under Copy-on-Write.
tweetsDF['full_text'] = tweetsDF['full_text'].replace(to_replace=r'&amp;', value=r'and', regex=True)

In [62]:
# Normalise each tweet: turn every non-letter character into a space,
# lowercase the result, and collapse whitespace runs to single spaces.
tweets = [
    ' '.join(re.sub("[^a-zA-Z]", " ", raw_text).lower().split())
    for raw_text in tweetsDF.full_text
]
tweetsDF['full_text'] = tweets

## Sentiment Analysis

In [63]:
# Wrap each cleaned tweet text in a TextBlob object, keeping row order.
bloblist = [TextBlob(clean_text) for clean_text in tweetsDF.full_text]

In [64]:
# Read the .sentiment attribute of every blob, in order.
sentlist = [item.sentiment for item in bloblist]

# Hold the raw sentiment objects in a single-column frame.
sentDF = pd.DataFrame()
sentDF['sentiment'] = sentlist

In [65]:
# Build the two-column sentiment frame directly from sentlist, whose
# entries unpack into (polarity, subjectivity) as the original rename
# of columns 0 and 1 implies.
# Unlike `sentDF = sentDF.sentiment.apply(pd.Series)`, this does not read
# from the variable it overwrites, so the cell can be re-run safely, and
# it avoids building one Series per row.
sentDF = pd.DataFrame(sentlist, columns=['polarity', 'subjectivity'])
# Report the mean polarity and mean subjectivity over all tweets.
print(sentDF.polarity.mean())
print(sentDF.subjectivity.mean())

0.04950138866188848
0.3255819384331037


In [66]:
# Copy the sentiment scores onto the tweet frame under capitalised names.
for target_col, source_col in (('Polarity', 'polarity'),
                               ('Subjectivity', 'subjectivity')):
    tweetsDF[target_col] = sentDF[source_col]

In [67]:
# Preview the first rows only instead of rendering the whole frame.
tweetsDF.head(10)

Unnamed: 0,user,created_at,full_text,truncated,contains_url,favorite_count,location,time_zone,following_count,followers_count,retweet_count,Polarity,Subjectivity
0,MartinKACrook,2018-02-28 22:32:24+00:00,population control will not reverse or mitigate climate change here is the logic and evidence populationcontrollaw,False,1,0,,Amsterdam,94,71,0,0.000000,0.000000
1,AggieHJ,2018-02-28 22:32:06+00:00,i see climate change deniers are out in force today due to the cold weather in europe they still haven t bothered to research the issue properly and are still hung up on the global warming meaning climatechangeisreal,False,1,0,,,218,307,0,-0.181250,0.368750
2,MadonnaMadsen,2018-02-28 22:32:00+00:00,convenient truth al gore the father of global warming suggest that we tax ranchers cows passing gas he owns mansion flies around in his personal jet and leaves a carbon footprint like a herd of elephants now the dems switch the name climate change what a hoax,False,1,0,,Central Time (US & Canada),10202,10713,3,0.000000,0.150000
3,hackneywick,2018-02-28 22:31:58+00:00,and yes it s consistent with climate change warming at the poles weakens the jetstream which blows warm air from the west and allows colder air in from the east,False,1,0,,London,7031,8057,2,0.425000,0.425000
4,ONGYEWKHOON5,2018-02-28 22:31:19+00:00,can lee s l do something to safe the global warming and climate changes,False,1,0,Singapore,,3896,806,0,0.250000,0.250000
5,munkihanger,2018-02-28 22:31:16+00:00,john ashton trying to elain to mayhem and co in necessarily very simple terms that allowing fracking won t help reduce global warming and catastrophic climate breakdown wonder if they ll get the message best keep on with nvda until they do roll on the united resistance,False,1,0,,,222,346,4,0.333333,0.254762
6,tlane04,2018-02-28 22:30:31+00:00,it might seem counterintuitive but global warming plays a role in blasts of bitter cold weather the reason it influences the jet stream here s how,False,1,0,,,474,56,153,-0.233333,0.500000
7,rcjhawk86,2018-02-28 22:29:50+00:00,convenient truth al gore the father of global warming suggest that we tax ranchers cows passing gas he owns mansion flies around in his personal jet and leaves a carbon footprint like a herd of elephants now the dems switch the name climate change what a hoax,False,1,0,,,6380,6295,3,0.000000,0.150000
8,bestdoxiemom,2018-02-28 22:29:47+00:00,convenient truth al gore the father of global warming suggest that we tax ranchers cows passing gas he owns mansion flies around in his personal jet and leaves a carbon footprint like a herd of elephants now the dems switch the name climate change what a hoax,False,1,0,,,4991,4951,3,0.000000,0.150000
9,yergnflergn,2018-02-28 22:29:34+00:00,if you want to research the soyboy epidemic causes just allege a link and possible solution to global warming sorry climate change in the research grant application maybe study the combined potential of estrogen and soy flatulence to absorb co and bam the money s in the bank,False,0,0,,,313,197,0,-0.125000,0.750000


## Location

In [13]:
# Distinct values of the location column, in order of first appearance.
tweetsDF.location.unique()

array([None, 'Singapore', 'Australia', 'United Kingdom', 'India',
       'Canada', 'United States', 'Kenya', 'Indonesia', 'Malaysia',
       'Mexico', 'Spain', 'Thailand', 'Austria', 'Ireland', 'Denmark',
       'Greece', 'Mali', 'Norway', 'Japan', 'Finland', 'New Zealand',
       'Fiji'], dtype=object)

In [14]:
# Distinct values of the time_zone column, in order of first appearance.
tweetsDF.time_zone.unique()

array(['Amsterdam', None, 'Central Time (US & Canada)', 'London',
       'Mountain Time (US & Canada)', 'Bern', 'Alaska', 'Baghdad',
       'Sydney', 'Casablanca', 'Eastern Time (US & Canada)', 'Athens',
       'Pacific Time (US & Canada)', 'America/Chicago', 'Nairobi',
       'Quito', 'Bogota', 'Europe/London', 'Brasilia', 'Arizona',
       'Midway Island', 'Kabul', 'Brisbane', 'Helsinki', 'Wellington',
       'Edinburgh', 'Atlantic Time (Canada)', 'Dublin', 'Pretoria',
       'Paris', 'Copenhagen', 'Melbourne', 'Stockholm', 'Africa/Nairobi',
       'Berlin', 'Belgrade', 'Canberra', 'Bucharest', "Nuku'alofa",
       'America/Los_Angeles', 'Monterrey', 'Madrid', 'Hawaii', 'Warsaw',
       'Brussels', 'Bangkok', 'Hanoi', 'Greenland', 'Vienna',
       'Europe/Madrid', 'Kuala Lumpur', 'Karachi', 'Kyiv', 'Buenos Aires',
       'Tbilisi', 'New Delhi', 'Mid-Atlantic', 'Prague',
       'America/Toronto', 'PST', 'Singapore', 'Perth', 'Budapest',
       'Santiago', 'America/New_York', 'Chennai'

In [15]:
# Number of rows with a non-missing location.
int(tweetsDF.location.count())

229

In [16]:
# Number of rows with a non-missing time_zone.
int(tweetsDF.time_zone.count())

10438

In [68]:
# Serialize tweetsDF to 'clean_tweets.pkl'.
with open('clean_tweets.pkl', 'wb') as pkl_out:  # 'wb' = write, binary mode
    pickle.dump(tweetsDF, pkl_out)