In [1]:
# utilities
import re
import pickle
import numpy as np
import pandas as pd


from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Importing the dataset
DATASET_COLUMNS  = ["sentiment", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
dataset = pd.read_csv('training.1600000.processed.noemoticon.csv',
                      encoding=DATASET_ENCODING , names=DATASET_COLUMNS)

# Removing the unnecessary columns.
dataset2 = dataset[['sentiment','text']]
# Replacing the values to ease understanding.
dataset2['sentiment'] = dataset['sentiment'].replace(4,1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset2['sentiment'] = dataset['sentiment'].replace(4,1)


In [5]:
dataset2["sentiment"]

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: sentiment, Length: 1600000, dtype: int64

In [6]:
dataset2.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [37]:
import random
sample_list = random.sample(range(0, 1600000), 20000)

In [38]:
id_ = 0
for i in range(10):
    id_ += 1
    if id_ > 10:
        break
    print(dataset2['sentiment'][i], dataset2['text'][i])

0 @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
0 is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!
0 @Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds
0 my whole body feels itchy and like its on fire 
0 @nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. 
0 @Kwesidei not the whole crew 
0 Need a hug 
0 @LOLTrish hey  long time no see! Yes.. Rains a bit ,only a bit  LOL , I'm fine thanks , how's you ?
0 @Tatiana_K nope they didn't have it 
0 @twittera que me muera ? 


In [39]:
id_ = 0
X = []
y = []
for i in sample_list:
    y.append(dataset2['sentiment'][i])
    X.append(dataset2['text'][i])

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.1, random_state = 0)

In [48]:
X_test = X_test[:1000]
y_test = y_test[:1000]

In [49]:
np.save("X_train_text.npy", X_train)
np.save("X_test_text.npy", X_test)
np.save("y_train.npy", y_train)
np.save("y_test.npy", y_test)

In [50]:
import spacy 
nlp = spacy.load('en_core_web_md')

In [43]:
import numpy as np
from tqdm import tqdm


X_train_num = np.array([nlp(i).vector for i in tqdm(X_train)])



  0%|                                                 | 0/18000 [00:00<?, ?it/s][A
  0%|                                         | 6/18000 [00:00<05:29, 54.68it/s][A
  0%|                                        | 14/18000 [00:00<04:29, 66.79it/s][A
  0%|                                        | 21/18000 [00:00<04:51, 61.68it/s][A
  0%|                                        | 29/18000 [00:00<04:27, 67.26it/s][A
  0%|                                        | 37/18000 [00:00<04:13, 70.78it/s][A
  0%|                                        | 46/18000 [00:00<04:02, 74.09it/s][A
  0%|                                        | 54/18000 [00:00<04:12, 71.02it/s][A
  0%|▏                                       | 62/18000 [00:00<04:23, 68.01it/s][A
  0%|▏                                       | 69/18000 [00:01<04:24, 67.67it/s][A
  0%|▏                                       | 76/18000 [00:01<04:26, 67.21it/s][A
  0%|▏                                       | 83/18000 [00:01<04:34, 65.22

  7%|██▋                                   | 1249/18000 [00:23<03:56, 70.79it/s][A
  7%|██▋                                   | 1257/18000 [00:23<03:55, 71.23it/s][A
  7%|██▋                                   | 1265/18000 [00:24<03:49, 72.92it/s][A
  7%|██▋                                   | 1273/18000 [00:24<03:52, 72.05it/s][A
  7%|██▋                                   | 1281/18000 [00:24<03:45, 74.11it/s][A
  7%|██▋                                   | 1290/18000 [00:24<03:39, 76.12it/s][A
  7%|██▋                                   | 1299/18000 [00:24<03:30, 79.37it/s][A
  7%|██▊                                   | 1307/18000 [00:24<03:50, 72.44it/s][A
  7%|██▊                                   | 1315/18000 [00:24<04:09, 66.77it/s][A
  7%|██▊                                   | 1322/18000 [00:24<04:11, 66.42it/s][A
  7%|██▊                                   | 1329/18000 [00:24<04:07, 67.27it/s][A
  7%|██▊                                   | 1336/18000 [00:25<04:18, 64.39i

 18%|██████▋                               | 3184/18000 [00:45<03:03, 80.65it/s][A
 18%|██████▋                               | 3193/18000 [00:45<03:32, 69.73it/s][A
 18%|██████▊                               | 3201/18000 [00:46<03:52, 63.65it/s][A
 18%|██████▊                               | 3208/18000 [00:46<03:47, 64.96it/s][A
 18%|██████▊                               | 3215/18000 [00:46<03:48, 64.70it/s][A
 18%|██████▊                               | 3222/18000 [00:46<04:01, 61.15it/s][A
 18%|██████▊                               | 3229/18000 [00:46<03:58, 61.93it/s][A
 18%|██████▊                               | 3236/18000 [00:46<04:07, 59.56it/s][A
 18%|██████▊                               | 3243/18000 [00:46<04:17, 57.31it/s][A
 18%|██████▊                               | 3250/18000 [00:46<04:05, 59.96it/s][A
 18%|██████▉                               | 3257/18000 [00:46<04:11, 58.55it/s][A
 18%|██████▉                               | 3263/18000 [00:47<04:47, 51.32i

 25%|█████████▋                            | 4562/18000 [01:10<03:52, 57.77it/s][A
 25%|█████████▋                            | 4569/18000 [01:10<03:41, 60.57it/s][A
 25%|█████████▋                            | 4576/18000 [01:10<03:35, 62.32it/s][A
 25%|█████████▋                            | 4584/18000 [01:10<03:21, 66.66it/s][A
 26%|█████████▋                            | 4593/18000 [01:10<03:06, 71.86it/s][A
 26%|█████████▋                            | 4602/18000 [01:10<02:54, 76.81it/s][A
 26%|█████████▋                            | 4610/18000 [01:10<03:04, 72.70it/s][A
 26%|█████████▋                            | 4618/18000 [01:10<02:59, 74.48it/s][A
 26%|█████████▊                            | 4626/18000 [01:10<03:02, 73.16it/s][A
 26%|█████████▊                            | 4634/18000 [01:11<03:01, 73.81it/s][A
 26%|█████████▊                            | 4642/18000 [01:11<02:58, 74.94it/s][A
 26%|█████████▊                            | 4650/18000 [01:11<02:57, 75.34i

 35%|█████████████▏                        | 6227/18000 [01:31<02:56, 66.63it/s][A
 35%|█████████████▏                        | 6234/18000 [01:32<02:54, 67.41it/s][A
 35%|█████████████▏                        | 6242/18000 [01:32<02:50, 68.77it/s][A
 35%|█████████████▏                        | 6249/18000 [01:32<03:04, 63.63it/s][A
 35%|█████████████▏                        | 6257/18000 [01:32<02:59, 65.59it/s][A
 35%|█████████████▏                        | 6264/18000 [01:32<02:59, 65.49it/s][A
 35%|█████████████▏                        | 6271/18000 [01:32<03:02, 64.33it/s][A
 35%|█████████████▎                        | 6278/18000 [01:32<03:03, 63.99it/s][A
 35%|█████████████▎                        | 6285/18000 [01:32<03:09, 61.92it/s][A
 35%|█████████████▎                        | 6292/18000 [01:32<03:02, 64.10it/s][A
 35%|█████████████▎                        | 6299/18000 [01:33<02:58, 65.52it/s][A
 35%|█████████████▎                        | 6306/18000 [01:33<03:09, 61.84i

 43%|████████████████▏                     | 7693/18000 [01:55<03:51, 44.46it/s][A
 43%|████████████████▎                     | 7699/18000 [01:55<03:45, 45.65it/s][A
 43%|████████████████▎                     | 7704/18000 [01:55<03:49, 44.83it/s][A
 43%|████████████████▎                     | 7709/18000 [01:55<03:57, 43.33it/s][A
 43%|████████████████▎                     | 7715/18000 [01:55<03:43, 46.01it/s][A
 43%|████████████████▎                     | 7721/18000 [01:56<03:57, 43.23it/s][A
 43%|████████████████▎                     | 7726/18000 [01:56<03:54, 43.80it/s][A
 43%|████████████████▎                     | 7731/18000 [01:56<03:47, 45.22it/s][A
 43%|████████████████▎                     | 7738/18000 [01:56<03:21, 50.88it/s][A
 43%|████████████████▎                     | 7745/18000 [01:56<03:05, 55.26it/s][A
 43%|████████████████▎                     | 7751/18000 [01:56<03:16, 52.05it/s][A
 43%|████████████████▍                     | 7758/18000 [01:56<03:04, 55.45i

 51%|███████████████████▍                  | 9182/18000 [02:18<02:16, 64.47it/s][A
 51%|███████████████████▍                  | 9189/18000 [02:18<02:18, 63.61it/s][A
 51%|███████████████████▍                  | 9197/18000 [02:18<02:10, 67.32it/s][A
 51%|███████████████████▍                  | 9204/18000 [02:18<02:28, 59.11it/s][A
 51%|███████████████████▍                  | 9211/18000 [02:18<02:38, 55.61it/s][A
 51%|███████████████████▍                  | 9217/18000 [02:19<02:45, 53.06it/s][A
 51%|███████████████████▍                  | 9223/18000 [02:19<02:43, 53.68it/s][A
 51%|███████████████████▍                  | 9231/18000 [02:19<02:27, 59.43it/s][A
 51%|███████████████████▌                  | 9239/18000 [02:19<02:18, 63.24it/s][A
 51%|███████████████████▌                  | 9246/18000 [02:19<02:24, 60.62it/s][A
 51%|███████████████████▌                  | 9253/18000 [02:19<02:36, 55.77it/s][A
 51%|███████████████████▌                  | 9259/18000 [02:19<02:34, 56.47i

 59%|█████████████████████▉               | 10675/18000 [02:41<01:49, 66.99it/s][A
  0%|                                                  | 0/9000 [09:40<?, ?it/s][A

 59%|█████████████████████▉               | 10689/18000 [02:48<37:45,  3.23it/s][A
 59%|█████████████████████▉               | 10694/18000 [02:48<29:58,  4.06it/s][A
 59%|█████████████████████▉               | 10699/18000 [02:48<23:27,  5.19it/s][A
 59%|██████████████████████               | 10705/18000 [02:49<17:10,  7.08it/s][A
 60%|██████████████████████               | 10711/18000 [02:49<12:48,  9.48it/s][A
 60%|██████████████████████               | 10716/18000 [02:49<10:34, 11.48it/s][A
 60%|██████████████████████               | 10721/18000 [02:49<08:26, 14.37it/s][A
 60%|██████████████████████               | 10728/18000 [02:49<06:19, 19.15it/s][A
 60%|██████████████████████               | 10733/18000 [02:49<05:28, 22.13it/s][A
 60%|██████████████████████               | 10739/18000 [02:49<04:25, 27.33

 69%|█████████████████████████▎           | 12336/18000 [03:10<01:09, 81.28it/s][A
 69%|█████████████████████████▍           | 12345/18000 [03:10<01:08, 81.98it/s][A
 69%|█████████████████████████▍           | 12355/18000 [03:11<01:06, 84.95it/s][A
 69%|█████████████████████████▍           | 12364/18000 [03:11<01:06, 84.58it/s][A
 69%|█████████████████████████▍           | 12373/18000 [03:11<01:07, 83.37it/s][A
 69%|█████████████████████████▍           | 12382/18000 [03:11<01:08, 82.33it/s][A
 69%|█████████████████████████▍           | 12391/18000 [03:11<01:09, 80.55it/s][A
 69%|█████████████████████████▍           | 12400/18000 [03:11<01:10, 79.51it/s][A
 69%|█████████████████████████▌           | 12409/18000 [03:11<01:08, 81.28it/s][A
 69%|█████████████████████████▌           | 12418/18000 [03:11<01:10, 79.26it/s][A
 69%|█████████████████████████▌           | 12426/18000 [03:12<01:13, 75.92it/s][A
 69%|█████████████████████████▌           | 12438/18000 [03:12<01:04, 86.74i

 77%|████████████████████████████▎        | 13772/18000 [03:33<01:17, 54.75it/s][A
 77%|████████████████████████████▎        | 13779/18000 [03:33<01:13, 57.54it/s][A
 77%|████████████████████████████▎        | 13785/18000 [03:33<01:12, 57.94it/s][A
 77%|████████████████████████████▎        | 13791/18000 [03:33<01:16, 55.37it/s][A
 77%|████████████████████████████▎        | 13797/18000 [03:34<01:18, 53.21it/s][A
 77%|████████████████████████████▎        | 13803/18000 [03:34<01:19, 52.77it/s][A
 77%|████████████████████████████▍        | 13809/18000 [03:34<01:21, 51.73it/s][A
 77%|████████████████████████████▍        | 13815/18000 [03:34<01:20, 51.70it/s][A
 77%|████████████████████████████▍        | 13822/18000 [03:34<01:13, 56.46it/s][A
 77%|████████████████████████████▍        | 13828/18000 [03:34<01:13, 56.50it/s][A
 77%|████████████████████████████▍        | 13836/18000 [03:34<01:06, 62.52it/s][A
 77%|████████████████████████████▍        | 13843/18000 [03:34<01:06, 62.06i

 87%|████████████████████████████████     | 15623/18000 [03:55<00:29, 79.76it/s][A
 87%|████████████████████████████████▏    | 15632/18000 [03:55<00:31, 75.06it/s][A
 87%|████████████████████████████████▏    | 15640/18000 [03:55<00:33, 71.37it/s][A
 87%|████████████████████████████████▏    | 15648/18000 [03:56<00:34, 68.62it/s][A
 87%|████████████████████████████████▏    | 15657/18000 [03:56<00:32, 72.76it/s][A
 87%|████████████████████████████████▏    | 15665/18000 [03:56<00:32, 71.73it/s][A
 87%|████████████████████████████████▏    | 15674/18000 [03:56<00:31, 74.29it/s][A
 87%|████████████████████████████████▏    | 15684/18000 [03:56<00:29, 78.20it/s][A
 87%|████████████████████████████████▎    | 15694/18000 [03:56<00:27, 82.94it/s][A
 87%|████████████████████████████████▎    | 15704/18000 [03:56<00:26, 85.45it/s][A
 87%|████████████████████████████████▎    | 15714/18000 [03:56<00:25, 88.53it/s][A
 87%|████████████████████████████████▎    | 15725/18000 [03:56<00:24, 92.70i

 96%|███████████████████████████████████▍ | 17250/18000 [04:17<00:09, 75.13it/s][A
 96%|███████████████████████████████████▍ | 17258/18000 [04:17<00:09, 74.94it/s][A
 96%|███████████████████████████████████▍ | 17267/18000 [04:17<00:09, 76.77it/s][A
 96%|███████████████████████████████████▌ | 17275/18000 [04:18<00:09, 76.54it/s][A
 96%|███████████████████████████████████▌ | 17283/18000 [04:18<00:09, 76.39it/s][A
 96%|███████████████████████████████████▌ | 17292/18000 [04:18<00:08, 80.26it/s][A
 96%|███████████████████████████████████▌ | 17301/18000 [04:18<00:08, 78.14it/s][A
 96%|███████████████████████████████████▌ | 17310/18000 [04:18<00:08, 80.56it/s][A
 96%|███████████████████████████████████▌ | 17319/18000 [04:18<00:08, 82.83it/s][A
 96%|███████████████████████████████████▌ | 17328/18000 [04:18<00:07, 84.18it/s][A
 96%|███████████████████████████████████▋ | 17337/18000 [04:18<00:07, 84.71it/s][A
 96%|███████████████████████████████████▋ | 17346/18000 [04:18<00:07, 84.43i

In [51]:
X_test_num = np.array([nlp(i).vector for i in tqdm(X_test)])

100%|███████████████████████████████████████| 1000/1000 [00:11<00:00, 87.73it/s]


In [53]:
np.sum(y_test)

505

In [54]:
np.save("X_train.npy", X_train_num)
np.save("X_test.npy", X_test_num)

In [55]:
X_train_num.shape

(18000, 300)