# DAY 1

In [1]:
import os
import json
import pandas as pd
import numpy as np
import gensim.models
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.callbacks import EarlyStopping

### Uploaded the 3 CSVs

In [5]:
### Uploaded the 3 CSVs
nrows=1989
USERNAME='clairemalbrel'
path1=f'/Users/{USERNAME}/code/{USERNAME}/STOCK_PREDICT/data/datasets-129-792900-upload_DJIA_table.csv'
path2=f'/Users/{USERNAME}/code/{USERNAME}/STOCK_PREDICT/data/Combined_News_DJIA.csv'
djia = pd.read_csv(path1, nrows=nrows)
news = pd.read_csv(path2,nrows=nrows)
   

### Merge Dataset

In [6]:
news['Date'] = pd.to_datetime(news['Date'])
djia['Date'] = pd.to_datetime(djia['Date'])
df = news.merge(djia)

In [7]:
df.shape

(1989, 33)

### Sort out Y

In [8]:
# get percentage change
df['change'] = df['Open'].pct_change()

In [9]:
df['change'] = df['change'].shift(-1)

In [10]:
# Get categorical data
def categorical(x):
    if x > 0:
        x = 1
    else:
        x = 0
    return x

In [11]:
df['target'] = df['change'].apply(categorical)

In [12]:
# Combine the top 25 daily news into 1 column
cols = df.columns[2:]
df['combined'] = df[cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)

In [13]:
df.head(3)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top25,Open,High,Low,Close,Volume,Adj Close,change,target,combined
0,2008-08-08,0,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",...,"b""No Help for Mexico's Kidnapping Surge""",11432.089844,11759.959961,11388.040039,11734.320312,212830000,11734.320312,0.02603,1,"b""Georgia 'downs two Russian warplanes' as cou..."
1,2008-08-11,1,b'Why wont America and Nato help us? If they w...,b'Bush puts foot down on Georgian conflict',"b""Jewish Georgian minister: Thanks to Israeli ...",b'Georgian army flees in disarray as Russians ...,"b""Olympic opening ceremony fireworks 'faked'""",b'What were the Mossad with fraudulent New Zea...,b'Russia angered by Israeli military sale to G...,b'An American citizen living in S.Ossetia blam...,...,"b""So this is what it's come to: trading sex fo...",11729.669922,11867.110352,11675.530273,11782.349609,183190000,11782.349609,0.004436,1,b'Why wont America and Nato help us? If they w...
2,2008-08-12,0,b'Remember that adorable 9-year-old who sang a...,"b""Russia 'ends Georgia operation'""","b'""If we had no sexual harassment we would hav...","b""Al-Qa'eda is losing support in Iraq because ...",b'Ceasefire in Georgia: Putin Outmaneuvers the...,b'Why Microsoft and Intel tried to kill the XO...,b'Stratfor: The Russo-Georgian War and the Bal...,"b""I'm Trying to Get a Sense of This Whole Geor...",...,"b""BBC NEWS | Asia-Pacific | Extinction 'by man...",11781.700195,11782.349609,11601.519531,11642.469727,173590000,11642.469727,-0.012637,0,b'Remember that adorable 9-year-old who sang a...


### Data Cleaning

In [14]:
from sklearn.preprocessing import FunctionTransformer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
import string
import re

In [15]:
def clean(text):
    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    for punctuation in punctuation:
        review1 = text.replace(punctuation, ' ') # Remove Punctuation
    lowercased = text.lower() # Lower Case
    without_b=text.replace(" b ","")
    without_b=text.replace("b'","")
    without_b=text.replace('b"',"")
    tokenized = word_tokenize(without_b) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('english')) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    lemma=WordNetLemmatizer() # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords] # Lemmatize
    return " ".join(lemmatized)
df['cleaned'] = df['combined'].apply(clean)

In [16]:
df['cleaned'][3]

'b refuse Israel weapon attack Iran president ordered attack Tskhinvali capital South Ossetia knew doomed How come realize Israel clear troop killed Reuters policy tough drug pointless say former civil servant ran unit year old found trunk Latest ransom paid kidnapping victim Mexico Head cop quits Prez dissolve suspect elite task moved million quake survivor prefab announces Operation Get All Up In Russia Grill Yeah end well force sink Georgian ship commander Navy air reconnaissance squadron provides President defense secretary airborne ability command nation nuclear weapon relieved duty CNN reader Russia action Georgia justified send fleet Black Sea help Georgia send troop humanitarian aid exercise warns Israeli plan strike Iran nuclear facility intriguing cyberalliance two Estonian computer expert heading Georgia keep country network running amid intense military confrontation Russia CNN Effect Georgia Schools Russia Information Russias response Georgia extinct humanitarian mission s

In [17]:
df.head(3)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Open,High,Low,Close,Volume,Adj Close,change,target,combined,cleaned
0,2008-08-08,0,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",...,11432.089844,11759.959961,11388.040039,11734.320312,212830000,11734.320312,0.02603,1,"b""Georgia 'downs two Russian warplanes' as cou...",Georgia two Russian warplane country move brin...
1,2008-08-11,1,b'Why wont America and Nato help us? If they w...,b'Bush puts foot down on Georgian conflict',"b""Jewish Georgian minister: Thanks to Israeli ...",b'Georgian army flees in disarray as Russians ...,"b""Olympic opening ceremony fireworks 'faked'""",b'What were the Mossad with fraudulent New Zea...,b'Russia angered by Israeli military sale to G...,b'An American citizen living in S.Ossetia blam...,...,11729.669922,11867.110352,11675.530273,11782.349609,183190000,11782.349609,0.004436,1,b'Why wont America and Nato help us? If they w...,wont America Nato help u If wont help u help I...
2,2008-08-12,0,b'Remember that adorable 9-year-old who sang a...,"b""Russia 'ends Georgia operation'""","b'""If we had no sexual harassment we would hav...","b""Al-Qa'eda is losing support in Iraq because ...",b'Ceasefire in Georgia: Putin Outmaneuvers the...,b'Why Microsoft and Intel tried to kill the XO...,b'Stratfor: The Russo-Georgian War and the Bal...,"b""I'm Trying to Get a Sense of This Whole Geor...",...,11781.700195,11782.349609,11601.519531,11642.469727,173590000,11642.469727,-0.012637,0,b'Remember that adorable 9-year-old who sang a...,adorable sang opening ceremony That fake Georg...


### Bag of Words

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

X_bow = vectorizer.fit_transform(df.clean_text)

AttributeError: 'DataFrame' object has no attribute 'clean_text'

Some of the reviews in the dataset are too short to be considered for training. Others are too long. 

Keep only the reviews that are between 100 and 500 words.

In [20]:
def word_count(string):
    tokens = string.split()
    n_tokens = len(tokens)
    return n_tokens

df['count'] = df['cleaned'].apply(word_count)
df = df[(df['count'] > 100) & (df['count'] < 500)]

len(df) #useless cause no review with less than 100 words

1989

## Vectorizer tuning

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())])

parameters = {'vectorizer__ngram_range': [(1, 1),(2,2)],
              'vectorizer__max_df':[0.33,0.5,0.75],
              'vectorizer__min_df':[0.05,0.1],
              'vectorizer__max_features' : [25,50,100],
              "nb__alpha":[0.1,0.5,1]}

gridsearch = GridSearchCV(pipeline, parameters, cv=5, scoring="accuracy")

gridsearch.fit(df.cleaned, df.Label)

print( "Best score:", gridsearch.best_score_)

Best score: 0.5309089528245763


In [22]:
print( "Best parameters:", gridsearch.best_params_)  

Best parameters: {'nb__alpha': 0.1, 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 25, 'vectorizer__min_df': 0.05, 'vectorizer__ngram_range': (1, 1)}


# DAY 2

In [23]:
import gensim
from gensim.test.utils import datapath
from gensim import utils

In [24]:
import seaborn as sns

In [25]:
from sklearn.model_selection import train_test_split

X = df['cleaned']
y = df['Label']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3)

In [26]:
#Tokenize the cleaned column
from gensim.utils import tokenize
sentance = [list(tokenize(s, deacc=True, lower=True)) for s in df['cleaned']]

In [27]:
df['cleaned'][1]

'wont America Nato help u If wont help u help Iraq put foot Georgian Georgian minister Thanks Israeli training fending Russia army flees disarray Russians advance Gori abandoned Russia without shot opening ceremony firework Mossad fraudulent New Zealand Passports Iraq angered Israeli military sale American citizen living blame Georgian leader genocide innocent To World War IV Now In High Definition move mistake monumental proportion press deeper Georgia say regime change Bindra win first ever Individual Olympic Gold Medal ship head Arctic define Jerusalem taxi station threaten quit rather work new bos French Team Stunned Phelps Relay US behind Georgian aggression Do believe TV neither Russian Georgian There much victim still going Montreal Canada police murdered boy Saturday overtake US largest South Ossetia PICS Physicians Group Condemns State Russia beaten United States head Peak question Georgia Russia conflict much better come trading sex food'

In [28]:
len(sentance)

1989

In [29]:
sentance[1]

['wont',
 'america',
 'nato',
 'help',
 'u',
 'if',
 'wont',
 'help',
 'u',
 'help',
 'iraq',
 'put',
 'foot',
 'georgian',
 'georgian',
 'minister',
 'thanks',
 'israeli',
 'training',
 'fending',
 'russia',
 'army',
 'flees',
 'disarray',
 'russians',
 'advance',
 'gori',
 'abandoned',
 'russia',
 'without',
 'shot',
 'opening',
 'ceremony',
 'firework',
 'mossad',
 'fraudulent',
 'new',
 'zealand',
 'passports',
 'iraq',
 'angered',
 'israeli',
 'military',
 'sale',
 'american',
 'citizen',
 'living',
 'blame',
 'georgian',
 'leader',
 'genocide',
 'innocent',
 'to',
 'world',
 'war',
 'iv',
 'now',
 'in',
 'high',
 'definition',
 'move',
 'mistake',
 'monumental',
 'proportion',
 'press',
 'deeper',
 'georgia',
 'say',
 'regime',
 'change',
 'bindra',
 'win',
 'first',
 'ever',
 'individual',
 'olympic',
 'gold',
 'medal',
 'ship',
 'head',
 'arctic',
 'define',
 'jerusalem',
 'taxi',
 'station',
 'threaten',
 'quit',
 'rather',
 'work',
 'new',
 'bos',
 'french',
 'team',
 'stunne

In [30]:
class MyCorpus(object):
    """An interator that yields sentences (lists of str)."""

    def __iter__(self):
        for line in sentance:
            # assume there's one document per line, tokens separated by whitespace
            yield line
            
corpus = MyCorpus()

In [31]:
import gensim.models
from gensim.models import word2vec

model = gensim.models.Word2Vec(sentences=corpus)

In [32]:
print(model.wv.vocab)



In [33]:
def compute_mean_embedding(l):
    """
    l: list of words
    """
    embeddings = []
    for word in l:
        if word in model.wv:
            embeddings.append(model.wv[word])
    embeddings = np.array(embeddings)
    mean_embedding = np.mean(embeddings, axis=0)
    return mean_embedding

In [34]:
mean_vectors = []
for line in sentance:
    mean_vec = compute_mean_embedding(line)
    if mean_vec.shape != (100,):
        mean_vec= np.zeros((100,))
    mean_vectors.append(mean_vec)
mean_vectors = np.array(mean_vectors)
print(mean_vectors)

[[-0.13484736 -0.03264871  0.19346976 ... -0.6729951   0.17684774
  -0.16329962]
 [-0.0905647  -0.04321179  0.1734156  ... -0.65049595  0.19761987
  -0.21851513]
 [-0.13542368 -0.02094352  0.15793966 ... -0.6471719   0.15487382
  -0.197221  ]
 ...
 [-0.06129044 -0.07323048  0.22796226 ... -0.6006517   0.24208948
  -0.20226589]
 [-0.05143913 -0.13198087  0.25643525 ... -0.62156445  0.3146547
  -0.19236858]
 [-0.06046563 -0.13494337  0.25251043 ... -0.6014525   0.30034643
  -0.16734654]]


# Approach with Train Test Split

## Train Validation Split

In [35]:
from sklearn.model_selection import train_test_split

X = df['cleaned']
y = df['Label']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3)

## Tokenization

In [36]:
from nltk import word_tokenize
import string

X_train_tokenized = []

for review in X_train:
    review_modified = ''.join([character for character in review if character not in string.punctuation])
    review_modified = ''.join([character for character in review_modified if not character.isdigit()])
    review_modified = review_modified.lower()
    X_train_tokenized.append(['<START>'] + word_tokenize(review_modified))

In [37]:
X_val_tokenized = []

for review in X_val:
    review_modified = ''.join([character for character in review if character not in string.punctuation])
    review_modified = ''.join([character for character in review_modified if not character.isdigit()])
    review_modified = review_modified.lower()
    X_val_tokenized.append(['<START>'] + word_tokenize(review_modified))

## Rushing to a baseline with Word2Vec and mean computing

### Training the Word2Vec model

In [38]:
class MyCorpus(object):
    """An interator that yields sentences (lists of str)."""

    def __iter__(self):
        for line in X_train_tokenized:
            # Assume there's one document per line, tokens separated by whitespace
            yield line
            
corpus = MyCorpus()

# Iterate !

In [39]:
import gensim.models
from gensim.models import word2vec

model = gensim.models.Word2Vec(sentences=corpus)

### Compute Mean Embeddings

In [40]:
def compute_mean_embedding(l):
    """
    l: list of words
    """
    embedded_l = np.array([model.wv[word] for word in l if word in model.wv.vocab.keys()])
    return embedded_l.mean(axis = 0)

In [41]:
X_train_mean_vectors = [compute_mean_embedding(review) for review in X_train_tokenized]

X_train_mean_vectors = np.array(X_train_mean_vectors)

X_train_mean_vectors.shape

(1392, 100)

In [42]:
X_val_mean_vectors = [compute_mean_embedding(review) for review in X_val_tokenized]

X_val_mean_vectors = np.array(X_val_mean_vectors)

X_val_mean_vectors.shape

(597, 100)

### Running a few classifiers as a baseline

In [43]:
%%time

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

ovr_log = OneVsRestClassifier(LogisticRegression(max_iter = 200))

ovr_log.fit(X_train_mean_vectors, y_train)

print(ovr_log.score(X_val_mean_vectors, y_val))

0.5393634840871022
CPU times: user 48.3 ms, sys: 9.73 ms, total: 58 ms
Wall time: 50.7 ms


In [44]:
%%time

from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier()

decision_tree.fit(X_train_mean_vectors, y_train)

print(decision_tree.score(X_val_mean_vectors, y_val))

0.5125628140703518
CPU times: user 228 ms, sys: 26.6 ms, total: 255 ms
Wall time: 220 ms


In [45]:
%%time

from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier()

random_forest.fit(X_train_mean_vectors, y_train)

print(random_forest.score(X_val_mean_vectors, y_val))

0.5041876046901173
CPU times: user 998 ms, sys: 16.1 ms, total: 1.01 s
Wall time: 1.08 s


In [46]:
%%time

from xgboost import XGBClassifier
xgboost_model = XGBClassifier()
xgboost_model.fit(X_train_mean_vectors, y_train)
print(xgboost_model.score(X_val_mean_vectors, y_val))

ModuleNotFoundError: No module named 'xgboost'

# Approach with specific train/test

In [47]:
#Train data: 2008-08-08 to 2014-12-31
train_2 = df[(df['Date'] > '2008-08-08') & (df['Date'] <= '2014-12-31')]
X_train_2 = train_2['cleaned']
y_train_2 = train_2['Label']

#Test data: 2015-01-02 to 2016-07-01
test_2 = df[(df['Date'] > '2015-01-02') & (df['Date'] <= '2016-07-01')]
X_val_2 = test_2['cleaned']
y_val_2 = test_2['Label']

### Tokenization

In [48]:
from nltk import word_tokenize
import string

X_train_2_tokenized = []

for review in X_train_2:
    review_modified = ''.join([character for character in review if character not in string.punctuation])
    review_modified = ''.join([character for character in review_modified if not character.isdigit()])
    review_modified = review_modified.lower()
    X_train_2_tokenized.append(['<START>'] + word_tokenize(review_modified))

In [49]:
X_val_2_tokenized = []

for review in X_val_2:
    review_modified = ''.join([character for character in review if character not in string.punctuation])
    review_modified = ''.join([character for character in review_modified if not character.isdigit()])
    review_modified = review_modified.lower()
    X_val_2_tokenized.append(['<START>'] + word_tokenize(review_modified))

### Training the Word2Vec model

In [50]:
class MyCorpus(object):
    """An interator that yields sentences (lists of str)."""

    def __iter__(self):
        for line in X_train_2_tokenized:
            # Assume there's one document per line, tokens separated by whitespace
            yield line
            
corpus = MyCorpus()

# Iterate !

In [51]:
import gensim.models
from gensim.models import word2vec

model_2 = gensim.models.Word2Vec(sentences=corpus)

In [52]:
def compute_mean_embedding(l):
    """
    l: list of words
    """
    embedded_l_2 = np.array([model_2.wv[word] for word in l if word in model_2.wv.vocab.keys()])
    return embedded_l_2.mean(axis = 0)

In [53]:
X_train_2_mean_vectors = [compute_mean_embedding(review) for review in X_train_2_tokenized]

X_train_2_mean_vectors = np.array(X_train_2_mean_vectors)

X_train_2_mean_vectors.shape

(1610, 100)

In [54]:
X_val_2_mean_vectors = [compute_mean_embedding(review) for review in X_val_2_tokenized]

X_val_2_mean_vectors = np.array(X_val_2_mean_vectors)

X_val_2_mean_vectors.shape

(377, 100)

### Running a few classifiers as a baseline

In [55]:
%%time

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

ovr_log = OneVsRestClassifier(LogisticRegression(max_iter = 200))

ovr_log.fit(X_train_2_mean_vectors, y_train_2)

print(ovr_log.score(X_val_2_mean_vectors, y_val_2))

0.506631299734748
CPU times: user 52.7 ms, sys: 6.68 ms, total: 59.4 ms
Wall time: 41.9 ms


In [56]:
%%time

from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier()

decision_tree.fit(X_train_2_mean_vectors, y_train_2)

print(decision_tree.score(X_val_2_mean_vectors, y_val_2))

0.5225464190981433
CPU times: user 226 ms, sys: 14.4 ms, total: 240 ms
Wall time: 179 ms


In [57]:
%%time

from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier()

random_forest.fit(X_train_2_mean_vectors, y_train_2)

print(random_forest.score(X_val_2_mean_vectors, y_val_2))

0.5013262599469496
CPU times: user 1.17 s, sys: 10.1 ms, total: 1.18 s
Wall time: 1.24 s


In [58]:
%%time

from xgboost import XGBClassifier
xgboost_model = XGBClassifier()
xgboost_model.fit(X_train_2_mean_vectors, y_train_2)
print(xgboost_model.score(X_val_2_mean_vectors, y_val_2))

ModuleNotFoundError: No module named 'xgboost'

## Tpot

In [61]:
X

0       Georgia two Russian warplane country move brin...
1       wont America Nato help u If wont help u help I...
2       adorable sang opening ceremony That fake Georg...
3       b refuse Israel weapon attack Iran president o...
4       expert admit legalise drug South Osetia pictur...
                              ...                        
1984    Barclays RBS share suspended trading tanking s...
1985    Scientists To Australia If You Want To Save Th...
1986    Explosion At Airport In former president Terro...
1987    Jamaica proposes marijuana dispenser tourist a...
1988    A woman Mexico City finally received birth cer...
Name: cleaned, Length: 1989, dtype: object

In [62]:
y

0       0
1       1
2       0
3       0
4       1
       ..
1984    0
1985    1
1986    1
1987    1
1988    1
Name: Label, Length: 1989, dtype: int64

In [66]:
df.head()

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,High,Low,Close,Volume,Adj Close,change,target,combined,cleaned,count
0,2008-08-08,0,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",...,11759.959961,11388.040039,11734.320312,212830000,11734.320312,0.02603,1,"b""Georgia 'downs two Russian warplanes' as cou...",Georgia two Russian warplane country move brin...,219
1,2008-08-11,1,b'Why wont America and Nato help us? If they w...,b'Bush puts foot down on Georgian conflict',"b""Jewish Georgian minister: Thanks to Israeli ...",b'Georgian army flees in disarray as Russians ...,"b""Olympic opening ceremony fireworks 'faked'""",b'What were the Mossad with fraudulent New Zea...,b'Russia angered by Israeli military sale to G...,b'An American citizen living in S.Ossetia blam...,...,11867.110352,11675.530273,11782.349609,183190000,11782.349609,0.004436,1,b'Why wont America and Nato help us? If they w...,wont America Nato help u If wont help u help I...,143
2,2008-08-12,0,b'Remember that adorable 9-year-old who sang a...,"b""Russia 'ends Georgia operation'""","b'""If we had no sexual harassment we would hav...","b""Al-Qa'eda is losing support in Iraq because ...",b'Ceasefire in Georgia: Putin Outmaneuvers the...,b'Why Microsoft and Intel tried to kill the XO...,b'Stratfor: The Russo-Georgian War and the Bal...,"b""I'm Trying to Get a Sense of This Whole Geor...",...,11782.349609,11601.519531,11642.469727,173590000,11642.469727,-0.012637,0,b'Remember that adorable 9-year-old who sang a...,adorable sang opening ceremony That fake Georg...,189
3,2008-08-13,0,b' U.S. refuses Israel weapons to attack Iran:...,"b""When the president ordered to attack Tskhinv...",b' Israel clears troops who killed Reuters cam...,b'Britain\'s policy of being tough on drugs is...,b'Body of 14 year old found in trunk; Latest (...,b'China has moved 10 *million* quake survivors...,"b""Bush announces Operation Get All Up In Russi...",b'Russian forces sink Georgian ships ',...,11633.780273,11453.339844,11532.959961,182550000,11532.959961,-0.00866,0,b' U.S. refuses Israel weapons to attack Iran:...,b refuse Israel weapon attack Iran president o...,189
4,2008-08-14,1,b'All the experts admit that we should legalis...,b'War in South Osetia - 89 pictures made by a ...,b'Swedish wrestler Ara Abrahamian throws away ...,b'Russia exaggerated the death toll in South O...,b'Missile That Killed 9 Inside Pakistan May Ha...,"b""Rushdie Condemns Random House's Refusal to P...",b'Poland and US agree to missle defense deal. ...,"b'Will the Russians conquer Tblisi? Bet on it,...",...,11718.280273,11450.889648,11615.929688,159790000,11615.929688,0.006863,1,b'All the experts admit that we should legalis...,expert admit legalise drug South Osetia pictur...,160


In [74]:
vectorizer = CountVectorizer(ngram_range=(2,2))

X_ngram = vectorizer.fit_transform(df['cleaned'])

KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import train_test_split

X = df['cleaned']
y = df['target']

X_train, X_val, y_train, y_val = train_test_split(X_ngram, y, test_size = 0.3)

In [None]:
from sklearn.model_selection import train_test_split
import os
from tpot import TPOTClassifier

# instanciate TPOTClassifier
tpot = TPOTClassifier(generations=5, population_size=50, verbosity=1,config_dict='TPOT sparse')
# process autoML with TPOT
tpot.fit(X_train, y_train)
# print score
print(tpot.score(X_val, y_val))

In [3]:
from sklearn.model_selection import train_test_split
import os
from tpot import TPOTClassifier

# instanciate TPOTClassifier
rescaledValidationX = scaler.transform(X_test)

tpot = TPOTClassifier(verbosity=3, scoring='accuracy', random_state=32,  n_jobs=-1, generations=5, population_size=100)

# process autoML with TPOT
tpot.fit(X_train, y_train)
# print score
print(tpot.score(rescaledValidationX, y_test))
tpot.export('tpot_stock_pipeline.py')

NameError: name 'scaler' is not defined