In [2]:
import pymongo
import pickle
import pandas as pd
pd.set_option('display.max_colwidth', -1)
import numpy as np
import csv

import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS
import seaborn as sns
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [3]:
with open('tweets_emoji_words.pkl', 'rb') as f:
    tweets_emoji_words = pickle.load(f)

In [4]:
len(tweets_emoji_words)

901873

In [5]:
tweet_text = tweets_emoji_words

In [None]:
# remove non-English from text
#tweet_text = [char.encode('ascii', errors='ignore') for char in tweet_text]

In [6]:
len(tweet_text)

901873

In [7]:
# get gender
gender = np.load('gender.npy')

In [8]:
len(gender)

901873

In [None]:
tweets_gender_df = pd.DataFrame(
    {'text': tweet_text,
     'gender': gender
    })

In [None]:
fem_text = tweets_gender_df[tweets_gender_df['gender'] == 'female']['text']

In [None]:
male_text = tweets_gender_df[tweets_gender_df['gender'] == 'male']['text']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    tweet_text, gender, test_size=0.33, random_state=42)

In [10]:
y = gender

In [None]:
# split the training data into 'M' and 'F' 
X_tr_M = X_train[y_train=='male']

X_tr_F = X_train[y_train=='female']

What are the top 25 'words' for female and for male?

In [None]:
from collections import Counter

In [None]:
stop_words = text.ENGLISH_STOP_WORDS

In [None]:
# in: pandas series of strings

def series_to_word_counts(series):

    word_counts = Counter(' '.join(series.astype('str').values.tolist()).split())
    word_counts = list(word_counts.items())
    word_counts = [(i.lower(), j) for i, j in word_counts]
    word_counts.sort(key=lambda x: x[1], reverse=True)
    
    return word_counts
# out: sorted list of word counts (lower case)

In [None]:
# in: sorted list of word counts

def top_25_word_counts(word_counts):
    total_counter = 0
    non_stop_counter = 0
    word_counts_list = []

    while non_stop_counter < 25:
        if word_counts[total_counter][0] not in stop_words:
            word_counts_list.append(word_counts[total_counter])
            non_stop_counter += 1
        total_counter += 1
        
    return word_counts_list
# out: top 25 words + counts

In [None]:
fem_word_counts = series_to_word_counts(fem_text)

fem_word_list = top_25_word_counts(fem_word_counts)

top_25_fem = [i for i,j in fem_word_list]

top_25_fem

In [None]:
male_word_counts = series_to_word_counts(male_text)

male_word_list = top_25_word_counts(male_word_counts)

top_25_male = [i for i,j in male_word_list]

top_25_male

In [None]:
# get original emojis from emoji dict and replace in list

# open emoji_dict 
with open('emoji_dict.pickle', 'rb') as handle:
    emoji_dict = pickle.load(handle)

In [None]:
# reverse the dict
emoji_word_dict = { v:k for k,v in emoji_dict.items() }

In [None]:
def replace_emoji_words(words):
    for word in words:
        word = emoji_word_dict[word]

In [None]:
top_25_fem = [emoji_word_dict.get(item,item)  for item in top_25_fem]

In [None]:
top_25_fem

In [None]:
top_25_male = [emoji_word_dict.get(item,item)  for item in top_25_male]

In [None]:
top_25_male

Count Vectorize:

In [None]:
cv = CountVectorizer(stop_words='english', min_df=10)

In [None]:
cv.fit(X_train)

In [None]:
X_cv = cv.transform(X_train)

In [None]:
X_cv.shape

Make a pipeline: Tfidf --> SVD --> Log Reg, Random Forest

In [None]:
from sklearn.pipeline import Pipeline

vec_pipe = Pipeline([('tfidf', TfidfVectorizer(min_df = 3, stop_words = 'english')),
                     ('svd', TruncatedSVD(n_components=200)),
                     ('lr', LogisticRegression()),
                    ])

In [None]:
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

Tf-idf:

In [11]:
tfidf = TfidfVectorizer(min_df = 5, stop_words = 'english')

In [12]:
X_tr_tf = tfidf.fit_transform(X_train)

In [19]:
X_tr_tf

<604254x56301 sparse matrix of type '<class 'numpy.float64'>'
	with 3416978 stored elements in Compressed Sparse Row format>

In [14]:
X_test_tf = tfidf.transform(X_test)

In [15]:
features_tf = tfidf.get_feature_names()

In [None]:
features_tf

In [None]:
X_tr_tf.shape

In [16]:
SVD = TruncatedSVD(n_components=2)
LSA_tr = SVD.fit_transform(X_tr_tf)

In [None]:
SVD.components_

In [17]:
LSA_test = SVD.transform(X_test_tf)

In [None]:
component_names = ["component_"+str(i+1) for i in range(2)]

LSA_df = pd.DataFrame(SVD.components_,
                                         index = component_names,
                                         columns = tfidf.get_feature_names()).T

In [None]:
fem_top_20_feats = LSA_df.sort('component_1', ascending=False)[:20]

In [None]:
m_top_20_feats = LSA_df.sort('component_2', ascending=False)[:20]

In [None]:
def replace_emoji_words(item):
    return emoji_word_dict.get(item,item)

In [None]:
fem_top_20_feats.index = fem_top_20_feats.index.map(replace_emoji_words)

In [None]:
fem_top_20_feats.iloc[:,:1]

In [None]:
m_top_20_feats.index = m_top_20_feats.index.map(replace_emoji_words)

In [1]:
m_top_20_feats.iloc[:,1:2]

NameError: name 'm_top_20_feats' is not defined

In [18]:
document_term_matrix = pd.DataFrame(X_tr_tf.toarray(),
                                        index=X_train,
                                        columns=tfidf.get_feature_names ())

MemoryError: 

In [None]:
document_term_matrix.head()

Classification Models:

In [None]:
param_dict = {
    'lr':{
        'C':np.logspace(-3,3,7)
    },
    'rf':{}
}

In [None]:
model_dict = {
    'lr':GridSearchCV(LogisticRegression(),
                             param_grid=param_dict['lr'],
                             cv=StratifiedShuffleSplit(n_splits=5, random_state=42)),
    'rf':GridSearchCV(RandomForestClassifier(),
                      param_grid=param_dict['rf'],
                      cv=StratifiedShuffleSplit(n_splits=5, random_state=42)),
}

In [None]:
def fit_all_models(x,y, model_dict):
    for model in model_dict.keys():
        model_dict[model].fit(x,y)
        print("{:5} best score: {}".format(model, model_dict[model].best_score_))

In [None]:
fit_all_models(LSA_tr, y_train, model_dict)

In [None]:
lr = GridSearchCV(LogisticRegression(),
                             param_grid=param_dict['lr'],
                             cv=StratifiedShuffleSplit(n_splits=5, random_state=42))

In [None]:
lr.fit(LSA_tr, y_train)

In [None]:
lr.cv_results_

In [None]:
rf = GridSearchCV(RandomForestClassifier(),
                      param_grid=param_dict['rf'],
                      cv=StratifiedShuffleSplit(n_splits=5, random_state=42))

In [None]:
rf.fit(LSA_tr, y_train)

In [None]:
rf.cv_results_

In [None]:
results = pd.DataFrame(rf.cv_results_)
results

Predict gender on test set:

In [None]:
predicted_gender_lr = model_dict[0].predict(LSA_test)

In [None]:
predicted_gender_rf = model_dict[1].predict(LSA_test)

In [None]:
dfs = top_feats_by_class(X_tf, y, features)

In [None]:
plot_tfidf_classfeats_h(dfs)

From Friday afternoon with Josh and Shehreen:

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
tfd_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svd', TruncatedSVD(n_components=6))
])

In [None]:
tfd_pipe.fit(cleantweets)

In [None]:
tfd = tfd_pipe.steps[0][1]
svd = tfd_pipe.steps[1][1]

In [None]:
tfd, svd

In [None]:
tfd_word_index = tfd.get_feature_names()

In [None]:
svd.components_.shape

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def svd_variance(svd_model, col_index):
    
    dimensions = ['Dimension {}'.format(i) for i in range(1,len(svd_model.components_)+1)]
    components = pd.DataFrame(np.round(svd_model.components_, 4), columns=col_index)
    ratios = svd_model.explained_variance_ratio_.reshape(len(svd_model.components_), 1)
    variance_ratios = pd.DataFrame(np.round(ratios, 4), columns = ['Explained Variance'])
    variance_ratios.index = dimensions

    fig, ax = plt.subplots(figsize = (24,10))

    # Plot the feature weights as a function of the components
    components.plot(ax = ax, kind = 'bar',legend='False');
    ax.set_ylabel("Feature Weights")
    ax.set_xticklabels(dimensions, rotation=0)
    ax.legend_.remove()


    # Display the explained variance ratios
    for i, ev in enumerate(svd_model.explained_variance_ratio_):
        ax.text(i-0.40, ax.get_ylim()[1] + 0.05, "Explained Variance\n          %.4f"%(ev))

In [None]:
svd_variance(svd, tfd_word_index)

In [None]:
svd

In [None]:
[word for word
in sorted(zip(tfd_word_index, svd.components_[0]), key = lambda x: x[1], reverse=True)
         if abs(word[1]) > .3]

In [None]:
[word for word
in sorted(zip(tfd_word_index, svd.components_[1]), key = lambda x: x[1], reverse=True)
         if abs(word[1]) > .3]