In [52]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from spacy.en.word_sets import STOP_WORDS
from sklearn.preprocessing import scale, Normalizer
from sklearn.pipeline import make_pipeline

plt.style.use('fivethirtyeight')

In [162]:
airlines = ['American', 'Delta', 'Southwest', 'United']
aa_neg = pd.read_csv('../csvs/aa_neg.csv', index_col = 'Unnamed: 0')
delta_neg = pd.read_csv('../csvs/delta_neg.csv', index_col = 'Unnamed: 0')
sw_neg = pd.read_csv('../csvs/sw_neg.csv', index_col = 'Unnamed: 0')
united_neg = pd.read_csv('../csvs/united_neg.csv', index_col = 'Unnamed: 0')
dfs_neg = [aa_neg, delta_neg, sw_neg, united_neg]

In [163]:
for df in dfs_neg:
    print(df.shape)

(4070, 12)
(2871, 12)
(2036, 12)
(2012, 12)


In [164]:
aa_pos = pd.read_csv('../csvs/aa_pos.csv', index_col = 'Unnamed: 0')
delta_pos = pd.read_csv('../csvs/delta_pos.csv', index_col = 'Unnamed: 0')
sw_pos = pd.read_csv('../csvs/sw_pos.csv', index_col = 'Unnamed: 0')
united_pos = pd.read_csv('../csvs/united_pos.csv', index_col = 'Unnamed: 0')
dfs_pos = [aa_pos, delta_pos, sw_pos, united_pos]

In [165]:
for df in dfs_pos:
    print(df.shape)

(1757, 12)
(2036, 12)
(2014, 12)
(1347, 12)


#### Create sentiment column

In [167]:
for df in dfs_pos:
    mask = df['compound'] <= -0.4
    df.loc[mask, 'negative_sent'] = 1.0
    mask = df['compound'] >= 0.7
    df.loc[mask, 'positive_sent'] = 1.0
    mask = (df['compound'] > -0.4) & (df['compound'] < 0.7)
    df.loc[mask, 'neutral_sent'] = 1.0
for df in dfs_neg:
    mask = df['compound'] <= -0.4
    df.loc[mask, 'negative_sent'] = 1.0
    mask = df['compound'] >= 0.7
    df.loc[mask, 'positive_sent'] = 1.0
    mask = (df['compound'] > -0.4) & (df['compound'] < 0.7)
    df.loc[mask, 'neutral_sent'] = 1.0

In [168]:
def label_sentiment(row):
    if row['neutral_sent'] == 1 :
        return 'Neutral'
    if row['positive_sent'] == 1 :
        return 'Positive'
    if row['negative_sent'] == 1 :
        return 'Negative'

In [169]:
for df in dfs_pos:
    df['sentiment'] = df.apply(label_sentiment, axis=1)
for df in dfs_neg:
    df['sentiment'] = df.apply(label_sentiment, axis=1)

In [170]:
cols_to_keep = ['Date', 'time', 'handle', 'tweet', 'tweets_clean1', 'hashtags', 'sentiment', 
                'United', 'Delta', 'Southwest', 'American']
for df in dfs_pos:
    df = df[cols_to_keep]
for df in dfs_neg:
    df = df[cols_to_keep]

In [171]:
for df in dfs_neg:
    print(df.shape)
for df in dfs_pos:
    print(df.shape)

(4070, 16)
(2871, 16)
(2036, 16)
(2012, 16)
(1757, 16)
(2036, 16)
(2014, 16)
(1347, 16)


In [181]:
df_whole = pd.concat([aa_neg, delta_neg, sw_neg, united_neg, aa_pos, delta_pos, sw_pos, united_pos])
df_whole['All Airlines'] = 1.0
df_whole.shape

(18143, 17)

In [None]:
unique = df_whole.loc[df_whole.index.value_counts() == 1].index.values
# sorted(unique)

In [252]:
df_whole.loc[unique].shape

(17354, 17)

In [255]:
df_whole1 = df_whole.loc[unique].copy()

In [256]:
df_whole1.rename(columns={'American': 'American Airlines', 
                         'Delta': 'Delta Airlines',
                         'Southwest': 'Southwest Airlines',
                         'United': 'United Airlines'}, inplace=True)

In [260]:
df_whole1.to_csv('../csvs/df_whole1.csv')

In [261]:
df_whole1 = pd.read_csv('../csvs/df_whole1.csv', index_col = 'Unnamed: 0')
df_whole1.shape

(17354, 17)

In [156]:
stop = STOP_WORDS.union({'flight', 'amp', 'im', 'fly', 'flights'})
airline_stopwords = {'United': {'united'}, 
                  'Delta': {'delta', 'dea'},
                  'Southwest' : {'southwest'},
                  'American' : {'american', 'aa'}, 
                   }
all_stop = stop.union({'united', 'delta', 'dea', 'southwest', 'american', 
                       'aa', 'flight', 'amp', 'im', 'fly',
                      'good', 'thanks', 'great', 'thank', 'best', 
                       'like', 'awesome', 'today', 'airlines', 'better'})

In [157]:
def get_clusters_tfidf(df, col, airline, true_k = 5, n_components = 20, verbose = True, random_state = 24):
#     df = df[df[airline] == 1] # convert the df to one with a single airline
    vect = TfidfVectorizer(tokenizer = None,
                               max_df = 0.5, 
                               max_features = 200000, 
                               stop_words = all_stop,
                               use_idf = True, 
                               ngram_range=(1, 3),
                               min_df = 3)
    matrix = vect.fit_transform(df[col])
    grams = vect.get_feature_names()
    
    # do svd
    svd = TruncatedSVD(n_components = n_components, random_state = random_state)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    reduced_matrix = lsa.fit_transform(matrix)
    
    # do kmeans
    km = KMeans(n_clusters = true_k, random_state = 24)
    km.fit(reduced_matrix)
    clusters = km.labels_.tolist()
    original_space_centroids = svd.inverse_transform(km.cluster_centers_)
    order_centroids = original_space_centroids.argsort()[:, ::-1]
    
    if verbose:
        original_space_centroids = svd.inverse_transform(km.cluster_centers_)
        order_centroids = original_space_centroids.argsort()[:, ::-1]
        print('\n', airline)
        for i in range(true_k):
            print("\nCluster %d:" % i)
            print(', '.join([grams[ind] for ind in order_centroids[i, :25]]))
    return km

In [264]:
km_all = get_clusters_tfidf(df = df_whole, 
                 col = 'tweets_clean1', 
                 airline = 'All Airlines',
                 n_components = 50, 
                 true_k = 10, 
                 random_state = 42)


 All Airlines

Cluster 0:
crew, nice, amazing, sure, right, new, yes, job, work, fun, morning, pretty, lol, trip, service, happy, day, getting, cool, hey, safe, super, hours, way, beautiful

Cluster 1:
service, customer, customer service, worst customer, worst customer service, worst, horrible, poor, disappointed, terrible, rude, bad, poor customer, poor customer service, amazing, terrible customer, new, travel, terrible customer service, horrible customer, horrible customer service, agent, excellent, seats, phone

Cluster 2:
love, love love, love flying, flying, new, love guys, guys, time, nice, people, travel, work, fun, free, passengers, love new, got, way, love field, field, friends, know, enjoy, lol, favorite

Cluster 3:
help, free, wifi, need, free wifi, hey, sure, nice, support, want, need help, airport, drink, appreciate, drinks, trying, lol, ticket, change, free drinks, check, bags, upgrade, free drink, phone

Cluster 4:
time, time flying, flying, waste, plane, airline, waste

In [265]:
pd.Series(km_all.labels_).value_counts()

9    4919
8    2843
0    2240
6    2142
7    1076
1    1066
3     976
5     719
4     713
2     660
dtype: int64

In [288]:
df_whole['cluster_num1'] = km_all.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [289]:
df_whole.to_csv('../csvs/df_final.csv')

In [290]:
df_whole.head()

Unnamed: 0,Date,time,handle,tweet,compound,American Airlines,Delta Airlines,Southwest Airlines,United Airlines,tweets_clean1,hashtags,cluster_num,negative_sent,positive_sent,neutral_sent,sentiment,All Airlines,cluter_num1,cluster_num1
33303,2017-11-04,04:34:20,jdub_316,@AmericanAir The worse part is I know I’ll nev...,-0.4404,1.0,0.0,0.0,0.0,worse part know i will never get response apology,,2,1.0,,,Negative,1.0,9,9
33304,2017-11-04,04:33:16,jdub_316,@AmericanAir I Live in DFW and am prisoner to ...,-0.5423,1.0,0.0,0.0,0.0,live dfw prisoner incredibly poorly run minute...,['#aaishorrible'],2,1.0,,,Negative,1.0,0,0
33313,2017-11-04,04:25:54,ToddBrown0323,@AmericanAir Still here! Waiting for a gate a...,-0.7562,1.0,0.0,0.0,0.0,still waiting gate dfw flight wake hell,,2,1.0,,,Negative,1.0,6,6
33321,2017-11-04,04:11:18,liz_depauw,.@AmericanAir at the rate customer service is ...,-0.7351,1.0,0.0,0.0,0.0,american air rate customer service moving migh...,,3,1.0,,,Negative,1.0,1,1
33328,2017-11-04,03:54:24,cmullark,@AmericanAir So we deplane no one is there. Fl...,-0.6594,1.0,0.0,0.0,0.0,deplane one flight attendant gets someonego g2...,,1,1.0,,,Negative,1.0,0,0
