# Downloading a dataset
This cell only needs to be run once: the download is skipped when the `data/sentiment140` directory already exists.

In [None]:
import requests
import os
import zipfile
from tqdm import tqdm

# Download and unpack the Sentiment140 archive, unless it was already extracted.
url = 'http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip'
data = os.path.join("data", "sentiment140")
filename = os.path.join("data", "sentiment140.zip")
if not os.path.isdir(data):
    print("Downloading...")
    os.makedirs("data", exist_ok=True)
    # Stream the response to disk in chunks instead of holding the whole
    # archive in memory, and fail loudly on HTTP errors or a stalled connection.
    with requests.get(url, allow_redirects=True, stream=True, timeout=60) as r:
        r.raise_for_status()
        # The server may omit Content-Length; tqdm then shows an open-ended counter.
        total = int(r.headers.get("content-length", 0)) or None
        with open(filename, 'wb') as archive, \
                tqdm(total=total, unit='B', unit_scale=True) as progress:
            for chunk in r.iter_content(chunk_size=1 << 20):
                archive.write(chunk)
                progress.update(len(chunk))

    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(filename, 'r') as zip_ref:
        zip_ref.extractall(data)
    print("Done!")

# Loading the dataset

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np


# Column names for the header-less Sentiment140 CSV files.
cols = ['sentiment','id','date','query_string','user','text']
data = os.path.join("data", "sentiment140")
# os.listdir returns entries in arbitrary order, so sort for a deterministic
# listing and select files by name rather than by position.
datasets = sorted(os.listdir(data))


def _find_dataset(prefix):
    """Return the single entry of `datasets` whose name starts with `prefix`.

    Raises:
        FileNotFoundError: if zero or several entries match, so that a stray
            or missing file is reported instead of silently loading the
            wrong split.
    """
    matches = [name for name in datasets if name.startswith(prefix)]
    if len(matches) != 1:
        raise FileNotFoundError(
            "Expected exactly one file starting with %r in %s, found %r"
            % (prefix, data, matches)
        )
    return matches[0]


# NOTE(review): the archive is expected to contain one "training*.csv" and one
# "testdata*.csv" file -- adjust the prefixes if the archive layout differs.
train = pd.read_csv(os.path.join(data, _find_dataset("training")), header=None, names=cols, encoding="ISO-8859-1")
test = pd.read_csv(os.path.join(data, _find_dataset("testdata")), header=None, names=cols, encoding="ISO-8859-1")

In [None]:
# Preview the first rows of the test DataFrame.
test.head()

# Visualisation

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary on all training tweets, then count tokens separately
# for the negative (sentiment == 0) and positive (sentiment == 4) tweets.
cvec = CountVectorizer()
cvec.fit(train.text)
neg_doc_matrix = cvec.transform(train[train['sentiment'] == 0].text)
pos_doc_matrix = cvec.transform(train[train['sentiment'] == 4].text)

# Column sums give one total per vocabulary term; flatten the 1 x n result.
neg_tf = np.sum(neg_doc_matrix,axis=0)
pos_tf = np.sum(pos_doc_matrix,axis=0)
neg = np.asarray(neg_tf).ravel()
pos = np.asarray(pos_tf).ravel()

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate it.
feature_names = (cvec.get_feature_names_out()
                 if hasattr(cvec, 'get_feature_names_out')
                 else cvec.get_feature_names())

# One row per token, built directly instead of transposing a 2 x vocab frame.
term_freq_df = pd.DataFrame({'negative': neg, 'positive': pos},
                            index=feature_names)
term_freq_df['total'] = term_freq_df['negative'] + \
                        term_freq_df['positive']
term_freq_df.sort_values(by='total',
                         ascending=False
                        ).iloc[:10]

In [None]:
# Bar chart of the 50 tokens with the highest 'negative' value.
# Rank once and reuse the slice for both the bar heights and the tick labels.
top_negative = term_freq_df.sort_values(by='negative', ascending=False)['negative'][:50]

y_pos = np.arange(50)
plt.figure(figsize=(12,10))
plt.bar(y_pos, top_negative, align='center', alpha=0.5)
plt.xticks(y_pos, top_negative.index, rotation='vertical')
plt.ylabel('Frequency')
plt.xlabel('Top 50 negative tokens')
plt.title('Top 50 tokens in negative tweets')

We want to down-weight very frequent words such as "to", so we use TF-IDF instead of raw counts.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same pipeline as the count-based version, but with TF-IDF weights.
# `cvec` and `term_freq_df` are deliberately rebound: the cells below read
# the TF-IDF based `term_freq_df`.
cvec = TfidfVectorizer()
cvec.fit(train.text)
neg_doc_matrix = cvec.transform(train[train['sentiment'] == 0].text)
pos_doc_matrix = cvec.transform(train[train['sentiment'] == 4].text)

# Column sums give one summed weight per vocabulary term; flatten the 1 x n result.
neg_tf = np.sum(neg_doc_matrix,axis=0)
pos_tf = np.sum(pos_doc_matrix,axis=0)
neg = np.asarray(neg_tf).ravel()
pos = np.asarray(pos_tf).ravel()

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate it.
feature_names = (cvec.get_feature_names_out()
                 if hasattr(cvec, 'get_feature_names_out')
                 else cvec.get_feature_names())

# One row per token, built directly instead of transposing a 2 x vocab frame.
term_freq_df = pd.DataFrame({'negative': neg, 'positive': pos},
                            index=feature_names)
term_freq_df['total'] = term_freq_df['negative'] + \
                        term_freq_df['positive']
term_freq_df.sort_values(by='total',
                         ascending=False
                        ).iloc[:10]

In [None]:
# Bar chart of the 50 tokens with the highest 'negative' value.
# At this point `term_freq_df` holds summed TF-IDF weights rather than raw
# counts, so the axis label and title say so.
top_negative = term_freq_df.sort_values(by='negative', ascending=False)['negative'][:50]

y_pos = np.arange(50)
plt.figure(figsize=(12,10))
plt.bar(y_pos, top_negative, align='center', alpha=0.5)
plt.xticks(y_pos, top_negative.index, rotation='vertical')
plt.ylabel('Summed TF-IDF weight')
plt.xlabel('Top 50 negative tokens')
plt.title('Top 50 tokens in negative tweets (TF-IDF)')

# Viewing cumulative distribution function
https://en.wikipedia.org/wiki/Cumulative_distribution_function

In [None]:
# NOTE: Jupyter renders only the last expression of a cell. The bare
# `sort_values(...)` expressions below are followed by more code in this cell,
# so they are evaluated but not displayed unless the cell is split.

# Share of each token's total weight that comes from positive tweets.
term_freq_df['pos_rate'] = term_freq_df['positive'] / term_freq_df['total']
term_freq_df.sort_values(by='pos_rate',
                          ascending=False).iloc[:10]
# Share of the overall positive weight that each token accounts for.
term_freq_df['pos_freq_pct'] = term_freq_df['positive'] / term_freq_df['positive'].sum()
term_freq_df.sort_values(by='pos_freq_pct',
                          ascending=False).iloc[:10]

from scipy.stats import hmean

# Harmonic mean of the two ratios, 0 where either ratio is not strictly
# positive. Computed with a single vectorised hmean call over the qualifying
# rows instead of a Python-level call per token.
pos_defined = (term_freq_df['pos_rate'] > 0) & (term_freq_df['pos_freq_pct'] > 0)
pos_hmean = pd.Series(0.0, index=term_freq_df.index)
pos_hmean[pos_defined] = hmean([
    term_freq_df.loc[pos_defined, 'pos_rate'].to_numpy(),
    term_freq_df.loc[pos_defined, 'pos_freq_pct'].to_numpy(),
])
term_freq_df['pos_hmean'] = pos_hmean

term_freq_df.sort_values(by='pos_hmean', ascending=False).iloc[:10]


from scipy.stats import norm
def normcdf(x):
    """Evaluate a normal CDF at every value of `x`.

    The distribution is parameterised by `x.mean()` and `x.std()`, so `x`
    must provide both methods. Returns whatever `norm.cdf` returns for `x`.
    """
    center = x.mean()
    spread = x.std()
    return norm.cdf(x, center, spread)
# Pass both positive-side ratios through normcdf, then combine the two
# resulting columns element-wise with a harmonic mean.
pos_rate_cdf = normcdf(term_freq_df['pos_rate'])
pos_freq_pct_cdf = normcdf(term_freq_df['pos_freq_pct'])
term_freq_df['pos_rate_normcdf'] = pos_rate_cdf
term_freq_df['pos_freq_pct_normcdf'] = pos_freq_pct_cdf
term_freq_df['pos_normcdf_hmean'] = hmean(
    [term_freq_df['pos_rate_normcdf'], term_freq_df['pos_freq_pct_normcdf']]
)
term_freq_df.sort_values(by='pos_normcdf_hmean', ascending=False).head(10)


# Share of each token's total weight that comes from negative tweets, and
# share of the overall negative weight that each token accounts for.
term_freq_df['neg_rate'] = term_freq_df['negative'] * 1./term_freq_df['total']
term_freq_df['neg_freq_pct'] = term_freq_df['negative'] * 1./term_freq_df['negative'].sum()

# Harmonic mean of the two ratios, 0 where either ratio is not strictly
# positive. Computed with a single vectorised hmean call over the qualifying
# rows instead of a Python-level call per token.
neg_defined = (term_freq_df['neg_rate'] > 0) & (term_freq_df['neg_freq_pct'] > 0)
neg_hmean = pd.Series(0.0, index=term_freq_df.index)
neg_hmean[neg_defined] = hmean([
    term_freq_df.loc[neg_defined, 'neg_rate'].to_numpy(),
    term_freq_df.loc[neg_defined, 'neg_freq_pct'].to_numpy(),
])
term_freq_df['neg_hmean'] = neg_hmean

# Pass both ratios through normcdf and combine the results element-wise.
term_freq_df['neg_rate_normcdf'] = normcdf(term_freq_df['neg_rate'])
term_freq_df['neg_freq_pct_normcdf'] = normcdf(term_freq_df['neg_freq_pct'])
term_freq_df['neg_normcdf_hmean'] = hmean([term_freq_df['neg_rate_normcdf'], term_freq_df['neg_freq_pct_normcdf']])
# Not rendered while more code follows in the same cell: Jupyter displays
# only a cell's final expression.
term_freq_df.sort_values(by='neg_normcdf_hmean', ascending=False).iloc[:10]

# `sns` is used below but was never imported anywhere in the notebook, so a
# fresh kernel failed with NameError; import it next to its only use.
import seaborn as sns

# Scatter of every token: negative vs positive CDF harmonic-mean score.
plt.figure(figsize=(8,6))
ax = sns.regplot(x="neg_normcdf_hmean",
                 y="pos_normcdf_hmean",
                 fit_reg=False,
                 scatter_kws={'alpha':0.5},
                 data=term_freq_df)
plt.ylabel('Positive Rate and Frequency CDF Harmonic Mean')
plt.xlabel('Negative Rate and Frequency CDF Harmonic Mean')
plt.title('neg_normcdf_hmean vs pos_normcdf_hmean')


In [None]:
from bokeh.plotting import figure
from bokeh.io import output_notebook, show
from bokeh.models import LinearColorMapper
from bokeh.models import HoverTool

# Render Bokeh output inline in the notebook.
output_notebook()

# Colour each point by its positive score. Series.min()/max() are vectorised
# and skip NaN, unlike the builtins, which iterate element by element and give
# order-dependent results when NaN is present.
pos_scores = term_freq_df['pos_normcdf_hmean']
color_mapper = LinearColorMapper(palette='Inferno256',
                                 low=pos_scores.min(),
                                 high=pos_scores.max())

p = figure(x_axis_label='neg_normcdf_hmean', y_axis_label='pos_normcdf_hmean')
# scatter() draws circle markers by default; circle(size=...) is deprecated
# in recent Bokeh releases.
p.scatter('neg_normcdf_hmean', 'pos_normcdf_hmean',
          size=5, alpha=0.3, source=term_freq_df,
          color={'field': 'pos_normcdf_hmean', 'transform': color_mapper})

# Bokeh exposes an unnamed DataFrame index as the 'index' column, so the
# tooltip shows the token.
hover = HoverTool(tooltips=[('token','@index')])
p.add_tools(hover)
show(p)