In [205]:
import pandas as pd
import numpy as np
import preprocessor as p
import re as regex

from nltk.stem.porter import *
stemmer = PorterStemmer()


census = pd.read_csv('ACS_10_5YR_S1902_with_ann.csv', sep=',', error_bad_lines=False, index_col=False, dtype='unicode', encoding='latin-1')
tweets = pd.read_csv('tweets.csv')

def remove_by_regex(tweets, regexp):
    tweets.loc[:, "tweet"].replace(regexp, "", inplace=True)
    return tweets

def clean(tweets):
    tweets['tweet'] = tweets['tweet'].apply(lambda x: p.clean(x))
    return tweets

def remove_special_chars(tweets):  # it unrolls the hashtags to normal words
    special_chars = [",", ":", "\"", "=", "&", ";", "%", "$",
                     "@", "%", "^", "*", "(", ")", "{", "}",
                     "[", "]", "|", "/", "\\", ">", "<", "-",
                     "!", "?", ".", "'",
                     "--", "---", "#"]
    special_chars = [",", "\"", "=", "&", ";", "%", "$",
                     "@", "%", "^", "*", "{", "}",
                     "[", "]", "|", "\\", ">", "<", "-",
                      ".", "'",
                     "--", "---", "#"]
    special_chars = ["!", "?", "@", "."]
    for remove in map(lambda r: regex.compile(regex.escape(r)), special_chars):
        tweets.loc[:, "tweet"].replace(remove, "", inplace=True)
    return tweets

def remove_usernames(tweets):
    return remove_by_regex(tweets, regex.compile(r"@[^\s]+[\s]?"))

def remove_numbers(tweets):
    return remove_by_regex(tweets, regex.compile(r"\s?[0-9]+\.?[0-9]*"))

In [234]:
def clean_tweets(t):
    t = t.dropna(subset=['tweet'])
    #t = clean(t)
    t = remove_special_chars(t)
    t = remove_usernames(t)
    #t = remove_numbers(t)
    return t

In [243]:
q1_tweets = pd.read_csv('q1_tweets.csv')
q4_tweets = pd.read_csv('q4_tweets.csv')
clean_q1 = clean_tweets(q1_tweets)
clean_q4 = clean_tweets(q4_tweets)

len(clean_q1)


54204

In [244]:
def get_tweet_stems(t):
    tweet_text = list(t['tweet'])
    tweet_text = [x.split(' ') for x in tweet_text]
    tweet_text = [[y.lower() for y in x] for x in tweet_text]

    #tweet_text = [[stemmer.stem(y.lower()) for y in x] for x in tweet_text]
    return tweet_text

q1_stems = get_tweet_stems(clean_q1)
q4_stems = get_tweet_stems(clean_q4)

In [245]:
import matplotlib.pyplot as plt
%matplotlib inline
jj = [len(x) for x in q1_stems]
long_q1_stems = [x for x in q1_stems if len(x) > 6]
long_q4_stems = [x for x in q4_stems if len(x) > 6]

In [246]:
from textblob import TextBlob

def get_polarities(text_list):
    return np.array([s.sentiment.polarity for s in TextBlob(text_list).sentences])

def create_block(text_list):
    return '. '.join([' '.join(y) for y in text_list])

q1_happy = [y for y in q1_stems if ':)' in y]
q4_happy = [y for y in q4_stems if ':)' in y]
q1_sad = [y for y in q1_stems if ':(' in y]
q4_sad = [y for y in q4_stems if ':(' in y]

print(len(q1_happy), len(q4_happy), len(q1_sad), len(q4_sad))
n1_happy, n4_happy, n1_sad, n4_sad = (len(q1_happy), len(q4_happy), len(q1_sad), len(q4_sad))
q1_happy_block = create_block(q1_happy)
q4_happy_block = create_block(q4_happy)
q1_sad_block = create_block(q1_sad)
q4_sad_block = create_block(q4_sad)


print(np.mean(get_polarities(q1_happy_block)))
print(np.mean(get_polarities(q4_happy_block)))

print(np.mean(get_polarities(q1_sad_block)))
print(np.mean(get_polarities(q4_sad_block)))
print('')
print(np.sum(get_polarities(q1_happy_block) > 0)/float(n1_happy))
print(np.sum(get_polarities(q4_happy_block) > 0)/float(n4_happy))

print('')
print(np.sum(get_polarities(q1_sad_block) < 0)/float(n1_sad))
print(np.sum(get_polarities(q4_sad_block) < 0)/float(n4_sad))

1320 886 324 234
0.427219202734
0.43679576255
-0.465829018474
-0.486888905847

0.94696969697
0.958239277652

0.867283950617
0.910256410256


## q1_tweets

In [281]:
flat_q1 = [item for sublist in q1_stems for item in sublist]
flat_q4 = [item for sublist in q4_stems for item in sublist]
unique_words = np.unique(flat_q1 + flat_q4)
vector_size = len(all_words)

In [276]:
def map_to_bow(sentence, all_words):
    vec = np.zeros((len(all_words)))
    for w in sentence:
        if w in skip_words:
            continue
        vec[np.where(all_words == w)[0]] += 1
    return vec

In [280]:
q1_bow = np.array([map_to_bow(s, all_words) for s in q1_stems])
q4_bow = np.array([map_to_bow(s, all_words) for s in q4_stems])

17.0


KeyboardInterrupt: 

In [293]:
all_words = np.array(flat_q1 + flat_q4)
(values,counts) = np.unique(all_words,return_counts=True)


array([ 57155, 148740,  60109,  85391,  54428,  16645,  71855,  82996,
        81545,  47425])

In [298]:
skip_words = values[np.argsort(counts)[-15:]]