In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from spacy.en.word_sets import STOP_WORDS
from sklearn.preprocessing import scale, Normalizer

plt.style.use('fivethirtyeight')

In [5]:
df = pd.read_pickle('../pickle_files/df3.p')

In [6]:
mask = df['compound'] <= -0.4
negative_rows = df.loc[mask] 
mask = df['compound'] >= 0.7
positive_rows = df.loc[mask]
mask = (df['compound'] > -0.4) & (df['compound'] < 0.7)
neutral_rows = df.loc[mask]

In [4]:
print('negative tweets:', len(negative_rows))
print('neutral tweets:', len(neutral_rows))
print('positive_tweets:', len(positive_rows))      

negative tweets: 10435
neutral tweets: 44953
positive_tweets: 6323


In [301]:
negative_rows.to_csv('../csvs/negative_rows.csv')
positive_rows.to_csv('../csvs/positive_rows.csv')

In [3]:
pd.read_csv('../csvs/negative_rows.csv', index_col='Unnamed: 0')

Unnamed: 0,Date,handle,id,language,tweet,tuples,tweets_clean,positive,negative,neutral,compound,United,Delta,Southwest,American,tweets_clean1,time,hashtags,Number of hashtags,lemmatized_tweets
3,2017-11-04,loooorenanicole,59fd32ff291ebbf619c97777,en,@united I'm SO DISAPPOINTED in your cust suppo...,"('support', 'in', 'shame', 'so', 'im', 'on', '...",im so disappointed in your cust support shame...,0.145,0.427,0.429,-0.7398,1.0,0.0,0.0,0.0,i am disappointed customer support shame,03:22:11,,0,disappointed customer support shame
8,2017-11-04,_MisEllis_,59fd32ff291ebbf619c9777e,en,@united This is just a gist of that debacle. B...,"('just', 'this', 'is', 'but', 'that', 'truly',...",this is just a gist of that debacle but ultim...,0.112,0.273,0.615,-0.6369,1.0,0.0,0.0,0.0,gist debacle uimatelyi truly disappointed infu...,03:17:06,,0,gist debacle uimatelyi truly disappoint infuri...
14,2017-11-04,_MisEllis_,59fd32ff291ebbf619c97786,en,"@united I missed multiple exams, appointments ...","('injury', 'missed', 'exams', 'multiple', 'we'...",i missed multiple exams appointments and job ...,0.093,0.377,0.530,-0.8225,1.0,0.0,0.0,0.0,missed muiple exams appointments job opportuni...,03:12:25,,0,miss muiple exam appointment job opportunity a...
17,2017-11-04,loooorenanicole,59fd32ff291ebbf619c9778b,en,@united I'm really frustrated with your custom...,"('perks', 'frustrated', 'with', 'supports', 'u...",im really frustrated with your customer suppo...,0.113,0.242,0.645,-0.4927,1.0,0.0,0.0,0.0,i am really frustrated customer supports appar...,03:07:26,,0,frustrated customer support apparent lack unde...
24,2017-11-04,ItsKittyBabee,59fd32ff291ebbf619c97793,en,@united Need some extra boarding time? You're ...,"('extra', 'one', 'like', 'ur', 'last', 'take',...",need some extra boarding time youre the last ...,0.094,0.264,0.642,-0.6705,1.0,0.0,0.0,0.0,need extra boarding time you are last one plan...,03:03:15,,0,ne extra boarding time plane time like punishm...
25,2017-11-04,sonja_travels,59fd32ff291ebbf619c97795,en,So very disappointed in @united’ handling of r...,"('with', 'amp', 'constant', 'of', 'had', 'in',...",so very disappointed in handling of racial ex...,0.000,0.246,0.754,-0.7070,1.0,0.0,0.0,0.0,disappointed handling racial exchange gate age...,03:02:58,,0,disappointed handle racial exchange gate agent...
29,2017-11-04,Chris_Chuter,59fd32ff291ebbf619c9779b,en,Had to throw away stand up poster due to arbit...,"('rules', 'up', 'had', 'airports', 'from', 'se...",had to throw away stand up poster due to arbi...,0.097,0.219,0.685,-0.6166,1.0,0.0,0.0,0.0,throw away stand poster due arbitrary rules ai...,02:59:02,,0,throw away stand poster arbitrary rule airport...
32,2017-11-04,BramKincheloe,59fd32ff291ebbf619c9779f,en,@united So late that we missed our next flight...,"('now', 'forced', 'us', 'put', 'so', 'flight',...",so late that we missed our next flight you pu...,0.000,0.161,0.839,-0.6369,1.0,0.0,0.0,0.0,late missed next flight put us new flight sfo ...,02:56:52,,0,late miss flight new flight sfo plane decomiss...
37,2017-11-04,deetothab,59fd32ff291ebbf619c977a9,en,@EdB_SP @united you ruined it for everybody,"('ruined', 'for', 'it', 'everybody', 'you')",you ruined it for everybody,0.000,0.341,0.659,-0.4767,1.0,0.0,0.0,0.0,ruined everybody,02:40:21,,0,ruin everybody
57,2017-11-04,AAforFun,59fd32ff291ebbf619c977cc,en,@simnett @united Sadly in an oligopolistic mar...,"('oligopolistic', 'sadly', 'market', 'an', 'ar...",sadly in an oligopolistic market consumers ar...,0.139,0.315,0.545,-0.4404,1.0,0.0,0.0,0.0,sadly oligopolistic market consumers inconveni...,02:09:22,,0,sadly oligopolistic market consumer inconvenie...


In [7]:
label = ['negative rows', 'neutral rows', 'positive rows']
num_rows = [len(negative_rows), len(neutral_rows), len(positive_rows)]
for l, n in zip(label, num_rows):
    print('{0}: {1}'.format(l, n))

negative rows: 10435
neutral rows: 44953
positive rows: 6323


In [8]:
df.columns

Index(['Date', 'handle', 'id', 'language', 'tweet', 'tuples', 'tweets_clean',
       'positive', 'negative', 'neutral', 'compound', 'United', 'Delta',
       'Southwest', 'American', 'tweets_clean1', 'time', 'hashtags',
       'Number of hashtags', 'lemmatized_tweets'],
      dtype='object')

In [9]:
united_neg = negative_rows.loc[negative_rows['United'] == 1]
delta_neg = negative_rows.loc[negative_rows['Delta'] == 1]
sw_neg = negative_rows.loc[negative_rows['Southwest'] == 1]
aa_neg = negative_rows.loc[negative_rows['American'] == 1]

united_pos = positive_rows.loc[positive_rows['United'] == 1]
delta_pos = positive_rows.loc[positive_rows['Delta'] == 1]
sw_pos = positive_rows.loc[positive_rows['Southwest'] == 1]
aa_pos =  positive_rows.loc[positive_rows['American'] == 1]

In [10]:
airlines = ['United', 'Delta', 'Southwest', 'American']
neg_dfs = [united_neg, delta_neg, sw_neg, aa_neg]
pos_dfs = [united_pos, delta_pos, sw_pos, aa_pos]

In [11]:
stop = STOP_WORDS.union({'flight', 'amp', 'im', 'fly', 'flights'})
airline_stopwords = {'United': {'united'}, 
                  'Delta': {'delta', 'dea'},
                  'Southwest' : {'southwest'},
                  'American' : {'american', 'aa'}, 
                   }
all_stop = stop.union({'united', 'delta', 'dea', 'southwest', 'american', 
                       'aa', 'flight', 'amp', 'im', 'fly',
                      'good', 'thanks', 'great', 'thank', 'best', 
                       'like', 'awesome', 'today', 'airlines', 'better'})

#### SVD then clustering

In [15]:
def get_clusters_tfidf(airline, df, col, true_k = 5, n_components = 10, verbose = True, random_state = 24):
    vect = TfidfVectorizer(tokenizer = None,
                               max_df = 0.5, 
                               max_features = 200000, 
                               stop_words = all_stop,
                               use_idf = True, 
                               ngram_range=(1, 3),
                               min_df = 3)
    matrix = vect.fit_transform(df[col])
    grams = vect.get_feature_names()
    
    # do svd
    svd = TruncatedSVD(n_components = n_components, random_state = random_state)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    reduced_matrix = lsa.fit_transform(matrix)
    
    # do kmeans
    km = KMeans(n_clusters = true_k, random_state = 24)
    km.fit(reduced_matrix)
    clusters = km.labels_.tolist()
    original_space_centroids = svd.inverse_transform(km.cluster_centers_)
    order_centroids = original_space_centroids.argsort()[:, ::-1]
    
    if verbose:
        original_space_centroids = svd.inverse_transform(km.cluster_centers_)
        order_centroids = original_space_centroids.argsort()[:, ::-1]
        print('\n', airline)
        for i in range(true_k):
            print("\nCluster %d:" % i)
            print(', '.join([grams[ind] for ind in order_centroids[i, :25]]))
    return km


In [16]:
kms_pos_tfidf_dict = {}
for airline, df in zip(airlines, pos_dfs):
    kms_pos_tfidf_dict[airline] = get_clusters_tfidf(airline, df, 'tweets_clean1', n_components = 10, 
                                                     true_k = 5, random_state = 42)
    df['cluster_num'] = kms_pos_tfidf_dict[airline].labels_ # append cluster number to df


 United

Cluster 0:
nice, love, trip, pretty, enjoy, plane, free, travel, amazing, flying, cool, ok, wow, know, got, day, happy, seat, glad, crew, thats, new, experience, class, work

Cluster 1:
amazing, crew, beautiful, service, yes, pretty, aircraft, story, love, congrats, people, home, day, wonderful, know, sfo, wow, flying, experience, pretty amazing, hours, lax, plane, getting, super

Cluster 2:
free, wow, hope, luck, yes, safe, day, home, fun, houston, time, getting, upgrade, travels, crew, safe travels, experience, plane, friend, watch, service, tomorrow, love, amazing, flying

Cluster 3:
help, home, seat, yes, happy, want, got, nice, need, customer, trip, love, hoping, miles, support, number, care, appreciated, hope, gate, far, time, times, ticket, upgrade

Cluster 4:
service, customer, customer service, agent, loyay, morning, crew, wonderful, lax, amazing, way, care, fantastic, day, travel, experience, gate, super, home, excellent, helpful, happy, phone, helping, representati

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



 Delta

Cluster 0:
time, home, free, know, happy, nice, getting, sure, flying, pretty, yes, got, gate, helping, help, comfort, love, crew, wifi, wow, upgrade, people, week, way, want

Cluster 1:
amazing, crew, hope, morning, gate, day, experience, safe, trip, helpful, agent, travel, job, guys, airport, care, home, beautiful, nice, making, flying, gate agent, got, super, way

Cluster 2:
service, customer, customer service, amazing, crew, excellent, gate, guys, support, agent, amazing customer, excellent service, kudos, amazing customer service, job, helpful, airline, wonderful, experience, work, staff, time, got, friendly, customer support

Cluster 3:
help, need, need help, direct, message, appreciated, direct message, way, hope, support, know, change, appreciate, twitter, thats, guys, lol, yes, number, trip, let, experience, time, love, got

Cluster 4:
love, love love, airline, time, flying, enjoy, people, new, guys, beautiful, travel, nice, fan, love flying, attendants, know, got, wo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



 Southwest

Cluster 0:
free, drink, halloween, drinks, free drink, free drinks, wifi, free wifi, bags, happy, day, coupons, happy halloween, love, airline, hey, tickets, hope, let, nice, drink coupons, know, lol, bags free, snacks

Cluster 1:
lol, crew, fun, time, amazing, flying, airline, nice, way, home, morning, favorite, got, thats, experience, happy, day, super, know, people, wow, travel, hope, southwestair, plane

Cluster 2:
new, safe, help, guys, home, getting, plane, day, amazing, hope, travel, wow, happy, trip, crew, nice, know, sure, gate, super, southwestair, got, need, travels, friend

Cluster 3:
love, flying, love flying, new, time, field, love field, fun, love love, wow, funny, lol, haha, dallas, terminal, thats, love guys, swa, dallas love, guys, nice, commercial, way, job, safe


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Cluster 4:
service, customer, customer service, amazing, travel, fantastic, got, guys, excellent, helping, sw, staff, found, want, wow, crew, help, love, helpful, kind, amazing customer, amazing customer service, bag, oakland, people

 American

Cluster 0:
home, safe, time, nice, way, got, job, pretty, flying, fun, making, trip, know, plane, glad, day, travels, lol, free, wonderful, gate, safe travels, wow, beautiful, seat

Cluster 1:
service, customer, customer service, excellent, crew, experience, amazing customer service, amazing customer, amazing, helpful, outstanding, loyal, got, flying, americanairlines, miles, free, making, help, outstanding customer service, outstanding customer, time, nice, team, getting

Cluster 2:
hope, wow, crew, free, happy, travel, amazing, job, sure, nice, way, guys, time, lol, air, know, making, service, got, care, flying, day, fantastic, wifi, yes

Cluster 3:
help, appreciate, time, care, got, need, plane, appreciate help, sure, know, hope, gate, help

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [17]:
kms_neg_tfidf_dict = {}
for airline, df in zip(airlines, neg_dfs):
    kms_neg_tfidf_dict[airline] = get_clusters_tfidf(airline, df, 'tweets_clean1', n_components = 10, 
                                                     true_k = 5, random_state = 42)
    df['cluster_num'] = kms_neg_tfidf_dict[airline].labels_ # append cluster number to df


 United

Cluster 0:
delayed, plane, delay, hour, hours, gate, hour delay, going, sitting, got, minutes, time, wait, min, broken, experience, missed, delayed hours, horrible, seat, miss, cancelled, ridiculous, sfo, ewr

Cluster 1:
time, bad, fuck, sad, suck, shit, gate, sorry, flying, experience, airline, cancelled, guys, disappointed, awful, plane, problem, check, know, sucks, try, boarding, miss, got, wrong

Cluster 2:
service, customer, customer service, worst customer, worst customer service, terrible, horrible, terrible customer service, terrible customer, flying, worst, seat, airline, absolutely, thats, bad, complaint, poor, guys, unacceptable, phone, customers, gate, rude, got

Cluster 3:
seat, flying, plane, check, gate, bag, horrible, hate, economy, worse, experience, carry, hell, time, seats, rude, pay, basic, wrong, basic economy, terrible, know, thats, got, lost

Cluster 4:
worst, airline, worst airline, flying, experience, hate, absolute, seat, worst customer, absolute wor

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



 Delta

Cluster 0:
time, bad, problem, flying, hate, thats, experience, hours, worst, disappointed, airline, need, bag, day, change, going, horrible, sorry, trying, error, know, got, wait, let, check

Cluster 1:
bomb, fuck, plane, bag, got, shit, gate, terrible, problem, bad, lost, let, going, know, need, wrong, luggage, guys, flying, check, experience, day, said, hey, head

Cluster 2:
service, customer, customer service, disappointed, terrible, rude, horrible, experience, worst, hold, worst customer, worst customer service, hour, delay, time, poor, phone, hours, wait, fuck, bad, minutes, response, worse, thats

Cluster 3:
delayed, hours, delay, plane, hour, delayed hours, stuck, connection, gate, atl, hr, minutes, missed, crew, missing, unacceptable, miss, late, waiting, airport, hour delay, home, broken, sitting, jfk

Cluster 4:
seat, seats, plane, broken, gate, worse, thats, got, paid, class, shame, upgrade, damn, terrible, people, rude, ridiculous, middle, change, flying, pay, min

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



 Southwest

Cluster 0:
bad, delay, worst, idea, experience, hell, hour, terrible, hour delay, gate, plane, airline, day, travel, way, disappointed, flying, terrible idea, bad experience, want, worst experience, awful, weather, hours, time

Cluster 1:
delayed, hours, delayed hours, plane, hey, got, delay, new, cancelled, time, know, problem, guys, gate, hour, connection, broken, bag, worst, bad, home, waiting, sad, lost, airline

Cluster 2:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



time, music, people, live, flying, hate, live music, want, fuck, airline, shit, listen, know, forced, going, way, think, got, bad, right, worse, sad, passengers, error, stop

Cluster 3:
service, customer, customer service, disappointed, horrible, terrible, worst, horrible customer service, horrible customer, know, rude, disappointing, wrong, going, time, lost, people, gate, poor, got, plane, disappointed customer, worst customer, worst customer service, disappointed customer service

Cluster 4:
wrong, plane, know, got, gate, error, problem, check, getting, bad, time, right, boarding, minutes, guys, people, let, bag, thats, sorry, hours, sad, trying, think, book

 American

Cluster 0:
service, worst, customer, customer service, shit, airline, flying, bad, fuck, worst airline, time, worst customer, worst customer service, experience, rude, poor, terrible, horrible, air, gate, people, staff, poor customer, poor customer service, plane

Cluster 1:
unacceptable, airline, know, hate, thats,

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [19]:
kmeans_neg_american = kms_neg_tfidf_dict['American']
pd.Series(kmeans_neg_american.labels_).value_counts()

1    903
3    847
2    763
4    756
0    665
dtype: int64

#### Map the clusters to the rows

In [432]:
rows_to_keep = ['Date', 'time', 'handle', 'tweet', 'compound', 'American', 'Delta', 'Southwest', 'United',
               'tweets_clean1', 'hashtags', 'cluster_num']
neg_dfs1 = []
for df in neg_dfs:
    df1= df.copy()
    df1 = df1[rows_to_keep]
    neg_dfs1.append(df1)

In [433]:
united_neg1 = neg_dfs1[0]
delta_neg1 = neg_dfs1[1]
sw_neg1 = neg_dfs1[2]
aa_neg1 = neg_dfs1[3]

In [434]:
for df in neg_dfs1:
    print(df.shape)

(2012, 12)
(2871, 12)
(2036, 12)
(4070, 12)


In [435]:
aa_neg1.to_csv('../csvs/aa_neg.csv')
delta_neg1.to_csv('../csvs/delta_neg.csv')
sw_neg1.to_csv('../csvs/sw_neg.csv')
united_neg1.to_csv('../csvs/united_neg.csv')

In [436]:
pos_dfs1 = []
for df in pos_dfs:
    df1= df.copy()
    df1 = df1[rows_to_keep]
    pos_dfs1.append(df1)

In [437]:
united_pos1 = pos_dfs1[0]
delta_pos1 = pos_dfs1[1]
sw_pos1 = pos_dfs1[2]
aa_pos1 = pos_dfs1[3]

In [438]:
for df in pos_dfs1:
    print(df.shape)

(1347, 12)
(2036, 12)
(2014, 12)
(1757, 12)


In [439]:
aa_pos1.to_csv('../csvs/aa_pos.csv')
delta_pos1.to_csv('../csvs/delta_pos.csv')
sw_pos1.to_csv('../csvs/sw_pos.csv')
united_pos1.to_csv('../csvs/united_pos.csv')