In [36]:
import numpy as np
import pandas as pd
pd.set_option("display.max_colwidth", 200)

In [37]:
data = pd.read_pickle("cleaned_tweets.pkl")
data.head()

Unnamed: 0,label,cleaned_tweets_incl_SW,cleaned_tweets_SW_removed,cleaned_tweets_SW_removed_len_gt2
0,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,fingerprint pregnancy test android apps beautiful cute health igers iphoneonly iphonesia iphone,fingerprint pregnancy test android apps beautiful cute health igers iphoneonly iphonesia iphone
1,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,finally a transparant silicon case thanks to my uncle yay sony xperia s sonyexperias,finally transparant silicon case thanks uncle yay sony xperia sonyexperias
2,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,we love this would you go talk makememories unplug relax iphone smartphone wifi connect,love talk makememories unplug relax iphone smartphone wifi connect
3,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,i am wired i know i am george i wa made that way iphone cute daventry home,wired know george way iphone cute daventry home
4,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,what amazing service apple will not even talk to me about a question i have unless i pay them for their stupid support,amazing service apple talk question unless pay stupid support


### Word Embeddings

In [38]:
tweets_list = list(data['cleaned_tweets_SW_removed'].apply(lambda x: x.split()))
tweets_list[0] # list of lists, where each tweet is a list of tokens, finally we have a list of tweets

['fingerprint',
 'pregnancy',
 'test',
 'android',
 'apps',
 'beautiful',
 'cute',
 'health',
 'igers',
 'iphoneonly',
 'iphonesia',
 'iphone']

In [39]:
# Creating your own Word2Vec Model & Train
from gensim.models import Word2Vec
# train model
cbow_model = Word2Vec(tweets_list, vector_size = 300, window = 3, min_count=5, sg=0)

In [40]:
# summarize the loaded model
print(cbow_model)

Word2Vec<vocab=2440, vector_size=300, alpha=0.025>


In [41]:
cbow_model.wv.index_to_key[:20]

['iphone',
 'apple',
 'i',
 'my',
 'the',
 'to',
 'a',
 'is',
 'samsung',
 'it',
 'and',
 'you',
 'new',
 'twitter',
 'for',
 'com',
 'phone',
 'me',
 'sony',
 'not']

In [42]:
len(cbow_model.wv.index_to_key)

2440

In [43]:
def document_vector(doc):
    """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
    
    # doc1 contains those words of the document which are included in the vocab
    doc1 = [word for word in doc.split() if word in cbow_model.wv.index_to_key]
    
    wv1 = []  # this will contain the WE of all the vocab words from the doc
    for word in doc1:
        wv1.append(cbow_model.wv.get_vector(word))
    wv1_ = np.array(wv1)
    wv1_mean = wv1_.mean(axis=0)
    return wv1_mean

# np.mean(model[doc], axis=0)

In [44]:
tweets_temp = data['cleaned_tweets_SW_removed'].apply(document_vector)

In [45]:
tweets_temp[:5]  # displaying the 1st 5 tweets, as document vectors

0    [0.0399835, 0.33625486, -0.02500192, 0.05551723, -0.02032441, -0.38807315, 0.15851666, 0.52045566, -0.124208905, 0.030367225, -0.03986065, -0.23179263, -0.04339204, -0.12538682, -0.1452291, -0.152...
1    [0.013774453, 0.20576945, 0.033048704, 0.055075806, -0.012772152, -0.31885657, 0.23940869, 0.39691025, -0.009999564, -0.17538038, -0.029512888, -0.2500132, 0.011852333, 0.08854894, -0.17654714, -0...
2    [0.033355672, 0.15187229, 0.085422955, 0.124750316, 0.009809439, -0.20016758, 0.2016915, 0.4177847, 0.099185616, -0.112650305, -0.014595959, -0.23172669, 0.013462442, 0.041303895, -0.15793206, -0....
3    [0.08745195, 0.13059999, 0.14677054, 0.18138403, 0.013174023, -0.17348404, 0.2313592, 0.46672985, 0.16958979, -0.1478469, 0.0079227565, -0.2903385, 0.03146164, 0.10342252, -0.18295807, 0.005532910...
4    [0.057071388, 0.124738224, 0.10685496, 0.15387082, 0.012065107, -0.18567508, 0.22289039, 0.4319276, 0.1582634, -0.17510347, 0.000743964, -0.26172107, 0.013467538, 0.1046397, -

In [46]:
tweets_temp[0].shape  # each document vecotr is 300-dimensional !!

(300,)

In [47]:
type(tweets_temp)

pandas.core.series.Series

In [48]:
# Combining all the document vectors into a singl numpy array (tweets_vec)
embedding_size = 300
tweets_vec = np.ones((len(tweets_temp), embedding_size))*np.nan
for i in range(tweets_vec.shape[0]):
    tweets_vec[i,:] = tweets_temp.iloc[i]

tweets_vec.shape # this itself is your final FEATURE MATRIX

(7920, 300)

In [49]:
# Create a new DF to store these new documnent features
df = pd.DataFrame(tweets_vec)
df['y'] = data['label']
df.dropna(how='any', axis=0, inplace=True)

In [50]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,y
0,0.039983,0.336255,-0.025002,0.055517,-0.020324,-0.388073,0.158517,0.520456,-0.124209,0.030367,...,0.215388,0.201467,-0.033585,0.260728,0.252156,0.082151,-0.109638,0.117947,0.006738,0
1,0.013774,0.205769,0.033049,0.055076,-0.012772,-0.318857,0.239409,0.39691,-0.01,-0.17538,...,0.263607,0.244832,-0.086145,0.204839,0.318631,-0.051588,-0.244891,0.097621,-0.084559,0
2,0.033356,0.151872,0.085423,0.12475,0.009809,-0.200168,0.201691,0.417785,0.099186,-0.11265,...,0.236323,0.21712,-0.017093,0.200741,0.249649,-7.9e-05,-0.116789,0.077153,-0.078748,0
3,0.087452,0.1306,0.146771,0.181384,0.013174,-0.173484,0.231359,0.46673,0.16959,-0.147847,...,0.273778,0.271048,0.005677,0.243268,0.244216,0.011721,-0.099318,0.087006,-0.078905,0
4,0.057071,0.124738,0.106855,0.153871,0.012065,-0.185675,0.22289,0.431928,0.158263,-0.175103,...,0.269,0.261812,-0.019221,0.216414,0.284855,-0.01799,-0.132647,0.068499,-0.106739,1


In [51]:
df.shape

(7920, 301)

In [52]:
X_word_emb = df.drop('y', axis=1)
y = df['y']
X_word_emb.shape

(7920, 300)

In [53]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

In [54]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
LR1 = LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1', C=0.4, random_state=42)
WE_pipe = Pipeline([('SC', StandardScaler()), ('LR', LR1)] )

results = cross_validate(WE_pipe, X_word_emb, y, cv=kfold, scoring='accuracy', return_train_score=True)

# print(results['train_score'])
print(np.round((results['train_score'].mean())*100, 2), np.round((results['train_score'].std())*100, 2)) 

# print(results['test_score'])
print(np.round((results['test_score'].mean())*100, 2), np.round((results['test_score'].std())*100, 2))

85.61 0.29
85.34 0.89


In [55]:
X = data['cleaned_tweets_SW_removed']
y = data['label']

# we want to include only those words in the vocab which have min df of 5,
# means select only those words which occur ATLEAST in 5 documents!! 
# AND SELECT the TOP 300 FEATURES ONLY to build the model
CV = CountVectorizer(min_df=5, max_features=300)

LR1 = LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1', C=0.4)
CV_pipe = Pipeline([('CV', CV) , ('LR', LR1)] )
results = cross_validate(CV_pipe, X, y, cv=kfold, scoring='accuracy', return_train_score=True)

# print(results['train_score'])
print(np.round((results['train_score'].mean())*100, 2), np.round((results['train_score'].std())*100, 2)) 

# print(results['test_score'])
print(np.round((results['test_score'].mean())*100, 2), np.round((results['test_score'].std())*100, 2)) 

CV.fit_transform(X)
len(CV.vocabulary_)  # no. of features AFTER applying the stopwords

88.96 0.1
87.78 0.94


300

### Word Embeddings from GloVe Model

In [56]:
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [57]:
# load the converted model
filename = 'word2vec.txt'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

FileNotFoundError: [Errno 2] No such file or directory: 'word2vec.txt'

In [None]:
model.get_vector('analytics')

NameError: name 'model' is not defined

In [None]:
# model.index_to_key

In [None]:
def document_vector_GloVe(doc):
    """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
    
    # doc1 contains those words of the document which are included in the vocab
    doc1 = [word for word in doc.split() if word in cbow_model.wv.index_to_key]
    
    wv1 = []  # this will contain the WE of all the vocab words from the doc
    for word in doc1:
        wv1.append(model.get_vector(word))
    wv1_ = np.array(wv1)
    wv1_mean = wv1_.mean(axis=0)
    return wv1_mean

In [None]:
tweets_temp = data['cleaned_tweets_SW_removed'].apply(document_vector_GloVe)


NameError: name 'model' is not defined

In [None]:
tweets_temp[:5]  # displaying the 1st 5 tweets, as document vectors

0    [0.0076073036, 0.36563912, -0.017829334, 0.13810664, 0.002734739, -0.37158328, 0.12625904, 0.5248367, -0.17825066, 0.01851082, -0.06740156, -0.16306688, -0.059494246, -0.062954575, -0.16027455, -0...
1    [0.0158105, 0.20234677, 0.06099586, 0.07994256, 0.007336532, -0.29250187, 0.23279652, 0.36627945, 0.010631448, -0.14495794, 0.006304062, -0.2803884, -0.015781898, 0.10213733, -0.15599898, -0.24977...
2    [0.0400173, 0.13644567, 0.10470885, 0.11657105, 0.02667524, -0.18809159, 0.21864848, 0.38404962, 0.124423295, -0.09046478, 0.02517883, -0.26747122, -0.0029446005, 0.039209347, -0.15143086, -0.0922...
3    [0.09637585, 0.1071649, 0.15986662, 0.16335377, 0.019727113, -0.17757939, 0.27743855, 0.41078225, 0.19750166, -0.11732262, 0.06421344, -0.34289852, -0.006108156, 0.08522984, -0.17811987, -0.076994...
4    [0.0717861, 0.09557765, 0.12791885, 0.12585501, 0.024032945, -0.17412983, 0.25427285, 0.37239587, 0.19065553, -0.13509518, 0.05637974, -0.32273, -0.011662999, 0.082977675, -0.

In [None]:
# Combining all the document vectors into a singl numpy array (tweets_vec)
embedding_size = 100
tweets_vec = np.ones((len(tweets_temp), embedding_size))*np.nan
for i in range(tweets_vec.shape[0]):
    tweets_vec[i,:] = tweets_temp.iloc[i]

# tweets_vec.shape # this itself is your final FEATURE MATRIX
# Create a new DF to store these new documnent features
df1 = pd.DataFrame(tweets_vec)
df1['y'] = data['label']
df1.dropna(how='any', axis=0, inplace=True)

X_word_emb = df1.drop('y', axis=1)
y = df1['y']
X_word_emb.shape

ValueError: could not broadcast input array from shape (300,) into shape (100,)

In [None]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
LR1 = LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1', C=0.4, random_state=42)
WE_pipe = Pipeline([('SC', StandardScaler()), ('LR', LR1)] )

results = cross_validate(WE_pipe, X_word_emb, y, cv=kfold, scoring='accuracy', return_train_score=True)

# print(results['train_score'])
print(np.round((results['train_score'].mean())*100, 2), np.round((results['train_score'].std())*100, 2)) 

# print(results['test_score'])
print(np.round((results['test_score'].mean())*100, 2), np.round((results['test_score'].std())*100, 2))

85.58 0.26
85.27 0.49
