# DAY 1

In [1]:
import os
import json
import pandas as pd
import numpy as np
import gensim.models
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.callbacks import EarlyStopping

### Uploaded the 3 CSVs

In [2]:
path_reddit = r'C:\Users\henon\code\Stock_market\STOCK_PREDICT\data\RedditNews.csv'
path_news = r'C:\Users\henon\code\Stock_market\STOCK_PREDICT\data\Combined_News_DJIA.csv'
path_djia = r'C:\Users\henon\code\Stock_market\STOCK_PREDICT\data\upload_DJIA_table.csv'

In [3]:
reddit = pd.read_csv(path_reddit)
news = pd.read_csv(path_news)
djia = pd.read_csv(path_djia)

### Merge Dataset

In [4]:
news['Date'] = pd.to_datetime(news['Date'])
djia['Date'] = pd.to_datetime(djia['Date'])
df = news.merge(djia)

In [52]:
df.shape

(1988, 37)

### Sort out Y

In [53]:
# get percentage change
df['change'] = df['Open'].pct_change()

In [54]:
df['change'] = df['change'].shift(-1)

In [55]:
# Get categorical data
def categorical(x):
    if x > 0:
        x = 1
    else:
        x = 0
    return x

In [56]:
df['target'] = df['change'].apply(categorical)

In [57]:
# Combine the top 25 daily news into 1 column
cols = df.columns[2:]
df['combined'] = df[cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)

In [59]:
df.head(3)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Open,High,Low,Close,Volume,Adj Close,change,target,combined,cleaned
1,2008-08-11,1,b'Why wont America and Nato help us? If they w...,b'Bush puts foot down on Georgian conflict',"b""Jewish Georgian minister: Thanks to Israeli ...",b'Georgian army flees in disarray as Russians ...,"b""Olympic opening ceremony fireworks 'faked'""",b'What were the Mossad with fraudulent New Zea...,b'Russia angered by Israeli military sale to G...,b'An American citizen living in S.Ossetia blam...,...,11729.669922,11867.110352,11675.530273,11782.349609,183190000,11782.349609,0.004436,1,b'Why wont America and Nato help us? If they w...,wont America Nato help u If wont help u help I...
2,2008-08-12,0,b'Remember that adorable 9-year-old who sang a...,"b""Russia 'ends Georgia operation'""","b'""If we had no sexual harassment we would hav...","b""Al-Qa'eda is losing support in Iraq because ...",b'Ceasefire in Georgia: Putin Outmaneuvers the...,b'Why Microsoft and Intel tried to kill the XO...,b'Stratfor: The Russo-Georgian War and the Bal...,"b""I'm Trying to Get a Sense of This Whole Geor...",...,11781.700195,11782.349609,11601.519531,11642.469727,173590000,11642.469727,-0.012637,0,b'Remember that adorable 9-year-old who sang a...,adorable sang opening ceremony That fake Georg...
3,2008-08-13,0,b' U.S. refuses Israel weapons to attack Iran:...,"b""When the president ordered to attack Tskhinv...",b' Israel clears troops who killed Reuters cam...,b'Britain\'s policy of being tough on drugs is...,b'Body of 14 year old found in trunk; Latest (...,b'China has moved 10 *million* quake survivors...,"b""Bush announces Operation Get All Up In Russi...",b'Russian forces sink Georgian ships ',...,11632.80957,11633.780273,11453.339844,11532.959961,182550000,11532.959961,-0.00866,0,b' U.S. refuses Israel weapons to attack Iran:...,b refuse Israel weapon attack Iran president o...


### Data Cleaning

In [9]:
from sklearn.preprocessing import FunctionTransformer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
import string
import re

In [10]:
def clean(text):
    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    for punctuation in punctuation:
        review1 = text.replace(punctuation, ' ') # Remove Punctuation
    lowercased = text.lower() # Lower Case
    without_b=text.replace(" b ","")
    without_b=text.replace("b'","")
    without_b=text.replace('b"',"")
    tokenized = word_tokenize(without_b) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('english')) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    lemma=WordNetLemmatizer() # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords] # Lemmatize
    return " ".join(lemmatized)
df['cleaned'] = df['combined'].apply(clean)

In [15]:
df['cleaned'][3]

'b refuse Israel weapon attack Iran president ordered attack Tskhinvali capital South Ossetia knew doomed How come realize Israel clear troop killed Reuters policy tough drug pointless say former civil servant ran unit year old found trunk Latest ransom paid kidnapping victim Mexico Head cop quits Prez dissolve suspect elite task moved million quake survivor prefab announces Operation Get All Up In Russia Grill Yeah end well force sink Georgian ship commander Navy air reconnaissance squadron provides President defense secretary airborne ability command nation nuclear weapon relieved duty CNN reader Russia action Georgia justified send fleet Black Sea help Georgia send troop humanitarian aid exercise warns Israeli plan strike Iran nuclear facility intriguing cyberalliance two Estonian computer expert heading Georgia keep country network running amid intense military confrontation Russia CNN Effect Georgia Schools Russia Information Russias response Georgia extinct humanitarian mission s

In [63]:
df.head(3)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Open,High,Low,Close,Volume,Adj Close,change,target,combined,cleaned
1,2008-08-11,1,b'Why wont America and Nato help us? If they w...,b'Bush puts foot down on Georgian conflict',"b""Jewish Georgian minister: Thanks to Israeli ...",b'Georgian army flees in disarray as Russians ...,"b""Olympic opening ceremony fireworks 'faked'""",b'What were the Mossad with fraudulent New Zea...,b'Russia angered by Israeli military sale to G...,b'An American citizen living in S.Ossetia blam...,...,11729.669922,11867.110352,11675.530273,11782.349609,183190000,11782.349609,0.004436,1,b'Why wont America and Nato help us? If they w...,wont America Nato help u If wont help u help I...
2,2008-08-12,0,b'Remember that adorable 9-year-old who sang a...,"b""Russia 'ends Georgia operation'""","b'""If we had no sexual harassment we would hav...","b""Al-Qa'eda is losing support in Iraq because ...",b'Ceasefire in Georgia: Putin Outmaneuvers the...,b'Why Microsoft and Intel tried to kill the XO...,b'Stratfor: The Russo-Georgian War and the Bal...,"b""I'm Trying to Get a Sense of This Whole Geor...",...,11781.700195,11782.349609,11601.519531,11642.469727,173590000,11642.469727,-0.012637,0,b'Remember that adorable 9-year-old who sang a...,adorable sang opening ceremony That fake Georg...
3,2008-08-13,0,b' U.S. refuses Israel weapons to attack Iran:...,"b""When the president ordered to attack Tskhinv...",b' Israel clears troops who killed Reuters cam...,b'Britain\'s policy of being tough on drugs is...,b'Body of 14 year old found in trunk; Latest (...,b'China has moved 10 *million* quake survivors...,"b""Bush announces Operation Get All Up In Russi...",b'Russian forces sink Georgian ships ',...,11632.80957,11633.780273,11453.339844,11532.959961,182550000,11532.959961,-0.00866,0,b' U.S. refuses Israel weapons to attack Iran:...,b refuse Israel weapon attack Iran president o...


In [135]:
import re
if re.search(r'^b\s', text):
        text = text.replace("b ","")

TypeError: expected string or bytes-like object

### Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

X_bow = vectorizer.fit_transform(df.clean_text)

Some of the reviews in the dataset are too short to be considered for training. Others are too long. 

Keep only the reviews that are between 100 and 500 words.

In [122]:
def word_count(string):
    tokens = string.split()
    n_tokens = len(tokens)
    return n_tokens

df['count'] = df['cleaned'].apply(word_count)
df = df[(df['count'] > 100) & (df['count'] < 500)]

len(df) #useless cause no review with less than 100 words

1988

## Vectorizer tuning

In [123]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())])

parameters = {'vectorizer__ngram_range': [(1, 1),(2,2)],
              'vectorizer__max_df':[0.33,0.5,0.75],
              'vectorizer__min_df':[0.05,0.1],
              'vectorizer__max_features' : [25,50,100],
              "nb__alpha":[0.1,0.5,1]}

gridsearch = GridSearchCV(pipeline, parameters, cv=5, scoring="accuracy")

gridsearch.fit(df.cleaned, df.Label)

print( "Best score:", gridsearch.best_score_)

Best score: 0.5301823981367797


In [124]:
print( "Best parameters:", gridsearch.best_params_)  

Best parameters: {'nb__alpha': 0.1, 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 25, 'vectorizer__min_df': 0.1, 'vectorizer__ngram_range': (1, 1)}


# DAY 2

In [50]:
import gensim
from gensim.test.utils import datapath
from gensim import utils

In [51]:
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split

X = df['cleaned']
y = df['Label']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3)

In [19]:
#Tokenize the cleaned column
from gensim.utils import tokenize
sentance = [list(tokenize(s, deacc=True, lower=True)) for s in df['cleaned']]

In [22]:
df['cleaned'][1]

'wont America Nato help u If wont help u help Iraq put foot Georgian Georgian minister Thanks Israeli training fending Russia army flees disarray Russians advance Gori abandoned Russia without shot opening ceremony firework Mossad fraudulent New Zealand Passports Iraq angered Israeli military sale American citizen living blame Georgian leader genocide innocent To World War IV Now In High Definition move mistake monumental proportion press deeper Georgia say regime change Bindra win first ever Individual Olympic Gold Medal ship head Arctic define Jerusalem taxi station threaten quit rather work new bos French Team Stunned Phelps Relay US behind Georgian aggression Do believe TV neither Russian Georgian There much victim still going Montreal Canada police murdered boy Saturday overtake US largest South Ossetia PICS Physicians Group Condemns State Russia beaten United States head Peak question Georgia Russia conflict much better come trading sex food'

In [20]:
len(sentance)

1988

In [21]:
sentance[1]

['adorable',
 'sang',
 'opening',
 'ceremony',
 'that',
 'fake',
 'georgia',
 'operation',
 'if',
 'sexual',
 'harassment',
 'would',
 'child',
 'losing',
 'support',
 'iraq',
 'brutal',
 'crackdown',
 'activity',
 'regard',
 'including',
 'woman',
 'buying',
 'cucumber',
 'georgia',
 'putin',
 'outmaneuvers',
 'microsoft',
 'intel',
 'tried',
 'kill',
 'xo',
 'the',
 'war',
 'balance',
 'power',
 'trying',
 'get',
 'sense',
 'this',
 'whole',
 'war',
 'vote',
 'up',
 'if',
 'you',
 'think',
 'georgia',
 'started',
 'it',
 'or',
 'down',
 'if',
 'think',
 'russia',
 'did',
 'us',
 'military',
 'surprised',
 'timing',
 'swiftness',
 'russian',
 'military',
 'move',
 'south',
 'ossetia',
 'still',
 'trying',
 'sort',
 'happened',
 'us',
 'defense',
 'official',
 'said',
 'monday',
 'beats',
 'war',
 'drum',
 'iran',
 'dumps',
 'georgian',
 'military',
 'attacked',
 'south',
 'ossetian',
 'capital',
 'tskhinvali',
 'multiple',
 'rocket',
 'launcher',
 'designed',
 'devastate',
 'large',
 

In [23]:
class MyCorpus(object):
    """An interator that yields sentences (lists of str)."""

    def __iter__(self):
        for line in sentance:
            # assume there's one document per line, tokens separated by whitespace
            yield line
            
corpus = MyCorpus()

In [24]:
import gensim.models
from gensim.models import word2vec

model = gensim.models.Word2Vec(sentences=corpus)

In [25]:
print(model.wv.vocab)



In [26]:
def compute_mean_embedding(l):
    """
    l: list of words
    """
    embeddings = []
    for word in l:
        if word in model.wv:
            embeddings.append(model.wv[word])
    embeddings = np.array(embeddings)
    mean_embedding = np.mean(embeddings, axis=0)
    return mean_embedding

In [27]:
mean_vectors = []
for line in sentance:
    mean_vec = compute_mean_embedding(line)
    if mean_vec.shape != (100,):
        mean_vec= np.zeros((100,))
    mean_vectors.append(mean_vec)
mean_vectors = np.array(mean_vectors)
print(mean_vectors)

[[ 5.3531483e-05 -4.0379940e-03  6.7571037e-02 ... -3.4014739e-03
   3.4141240e-01  4.6203643e-01]
 [-2.8661577e-02  2.9765695e-04  9.4799809e-02 ... -2.3518534e-02
   3.4481561e-01  4.7990686e-01]
 [-3.2467535e-03 -3.9034352e-02  4.4547621e-02 ... -8.3060965e-02
   3.3611798e-01  4.0858021e-01]
 ...
 [ 3.2304373e-02 -2.0952314e-02  1.1159199e-01 ...  8.6210612e-03
   3.1434247e-01  4.6406960e-01]
 [ 7.1759395e-02 -3.0337464e-02  1.4530420e-01 ... -1.2696328e-02
   2.7835840e-01  4.6353331e-01]
 [ 8.0864467e-02 -1.2855265e-02  1.6154502e-01 ...  7.2128056e-03
   2.6306781e-01  4.4236791e-01]]


# Approach with Train Test Split

## Train Validation Split

In [31]:
from sklearn.model_selection import train_test_split

X = df['cleaned']
y = df['Label']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3)

## Tokenization

In [32]:
from nltk import word_tokenize
import string

X_train_tokenized = []

for review in X_train:
    review_modified = ''.join([character for character in review if character not in string.punctuation])
    review_modified = ''.join([character for character in review_modified if not character.isdigit()])
    review_modified = review_modified.lower()
    X_train_tokenized.append(['<START>'] + word_tokenize(review_modified))

In [33]:
X_val_tokenized = []

for review in X_val:
    review_modified = ''.join([character for character in review if character not in string.punctuation])
    review_modified = ''.join([character for character in review_modified if not character.isdigit()])
    review_modified = review_modified.lower()
    X_val_tokenized.append(['<START>'] + word_tokenize(review_modified))

## Rushing to a baseline with Word2Vec and mean computing

### Training the Word2Vec model

In [34]:
class MyCorpus(object):
    """An interator that yields sentences (lists of str)."""

    def __iter__(self):
        for line in X_train_tokenized:
            # Assume there's one document per line, tokens separated by whitespace
            yield line
            
corpus = MyCorpus()

# Iterate !

In [36]:
import gensim.models
from gensim.models import word2vec

model = gensim.models.Word2Vec(sentences=corpus)

### Compute Mean Embeddings

In [37]:
def compute_mean_embedding(l):
    """
    l: list of words
    """
    embedded_l = np.array([model.wv[word] for word in l if word in model.wv.vocab.keys()])
    return embedded_l.mean(axis = 0)

In [38]:
X_train_mean_vectors = [compute_mean_embedding(review) for review in X_train_tokenized]

X_train_mean_vectors = np.array(X_train_mean_vectors)

X_train_mean_vectors.shape

(1391, 100)

In [39]:
X_val_mean_vectors = [compute_mean_embedding(review) for review in X_val_tokenized]

X_val_mean_vectors = np.array(X_val_mean_vectors)

X_val_mean_vectors.shape

(597, 100)

### Running a few classifiers as a baseline

In [41]:
%%time

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

ovr_log = OneVsRestClassifier(LogisticRegression(max_iter = 200))

ovr_log.fit(X_train_mean_vectors, y_train)

print(ovr_log.score(X_val_mean_vectors, y_val))

0.5527638190954773
Wall time: 43 ms


In [42]:
%%time

from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier()

decision_tree.fit(X_train_mean_vectors, y_train)

print(decision_tree.score(X_val_mean_vectors, y_val))

0.5309882747068677
Wall time: 247 ms


In [43]:
%%time

from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier()

random_forest.fit(X_train_mean_vectors, y_train)

print(random_forest.score(X_val_mean_vectors, y_val))

0.5527638190954773
Wall time: 926 ms


In [61]:
%%time

from xgboost import XGBClassifier
xgboost_model = XGBClassifier()
xgboost_model.fit(X_train_mean_vectors, y_train)
print(xgboost_model.score(X_val_mean_vectors, y_val))

0.5293132328308208
Wall time: 3.13 s


# Approach with specific train/test

In [73]:
#Train data: 2008-08-08 to 2014-12-31
train_2 = df[(df['Date'] > '2008-08-08') & (df['Date'] <= '2014-12-31')]
X_train_2 = train_2['cleaned']
y_train_2 = train_2['Label']

#Test data: 2015-01-02 to 2016-07-01
test_2 = df[(df['Date'] > '2015-01-02') & (df['Date'] <= '2016-07-01')]
X_val_2 = test_2['cleaned']
y_val_2 = test_2['Label']

### Tokenization

In [74]:
from nltk import word_tokenize
import string

X_train_2_tokenized = []

for review in X_train_2:
    review_modified = ''.join([character for character in review if character not in string.punctuation])
    review_modified = ''.join([character for character in review_modified if not character.isdigit()])
    review_modified = review_modified.lower()
    X_train_2_tokenized.append(['<START>'] + word_tokenize(review_modified))

In [75]:
X_val_2_tokenized = []

for review in X_val_2:
    review_modified = ''.join([character for character in review if character not in string.punctuation])
    review_modified = ''.join([character for character in review_modified if not character.isdigit()])
    review_modified = review_modified.lower()
    X_val_2_tokenized.append(['<START>'] + word_tokenize(review_modified))

### Training the Word2Vec model

In [76]:
class MyCorpus(object):
    """An interator that yields sentences (lists of str)."""

    def __iter__(self):
        for line in X_train_2_tokenized:
            # Assume there's one document per line, tokens separated by whitespace
            yield line
            
corpus = MyCorpus()

# Iterate !

In [77]:
import gensim.models
from gensim.models import word2vec

model_2 = gensim.models.Word2Vec(sentences=corpus)

In [78]:
def compute_mean_embedding(l):
    """
    l: list of words
    """
    embedded_l_2 = np.array([model_2.wv[word] for word in l if word in model_2.wv.vocab.keys()])
    return embedded_l_2.mean(axis = 0)

In [79]:
X_train_2_mean_vectors = [compute_mean_embedding(review) for review in X_train_2_tokenized]

X_train_2_mean_vectors = np.array(X_train_2_mean_vectors)

X_train_2_mean_vectors.shape

(1610, 100)

In [80]:
X_val_2_mean_vectors = [compute_mean_embedding(review) for review in X_val_2_tokenized]

X_val_2_mean_vectors = np.array(X_val_2_mean_vectors)

X_val_2_mean_vectors.shape

(377, 100)

### Running a few classifiers as a baseline

In [81]:
%%time

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

ovr_log = OneVsRestClassifier(LogisticRegression(max_iter = 200))

ovr_log.fit(X_train_2_mean_vectors, y_train_2)

print(ovr_log.score(X_val_2_mean_vectors, y_val_2))

0.506631299734748
Wall time: 41 ms


In [82]:
%%time

from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier()

decision_tree.fit(X_train_2_mean_vectors, y_train_2)

print(decision_tree.score(X_val_2_mean_vectors, y_val_2))

0.5013262599469496
Wall time: 171 ms


In [83]:
%%time

from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier()

random_forest.fit(X_train_2_mean_vectors, y_train_2)

print(random_forest.score(X_val_2_mean_vectors, y_val_2))

0.47214854111405835
Wall time: 1.11 s


In [84]:
%%time

from xgboost import XGBClassifier
xgboost_model = XGBClassifier()
xgboost_model.fit(X_train_2_mean_vectors, y_train_2)
print(xgboost_model.score(X_val_2_mean_vectors, y_val_2))

0.5013262599469496
Wall time: 851 ms


## Tpot

In [85]:
# install dependencies
!pip install deap update_checker tqdm stopit joblib torch
# install xgboost optionally
!pip install xgboost
# install tpot
!pip install tpot

Collecting deap

  ERROR: Command errored out with exit status 1:
   command: 'c:\users\henon\.venvs\lewagon\scripts\python.exe' -u -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\henon\\AppData\\Local\\Temp\\pip-install-uuq36juy\\torch\\setup.py'"'"'; __file__='"'"'C:\\Users\\henon\\AppData\\Local\\Temp\\pip-install-uuq36juy\\torch\\setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' bdist_wheel -d 'C:\Users\henon\AppData\Local\Temp\pip-wheel-4lk2ba61'
       cwd: C:\Users\henon\AppData\Local\Temp\pip-install-uuq36juy\torch\
  Complete output (30 lines):
  running bdist_wheel
  running build
  running build_deps
  Traceback (most recent call last):
    File "<string>", line 1, in <module>
    File "C:\Users\henon\AppData\Local\Temp\pip-install-uuq36juy\torch\setup.py", line 265, in <module>
      description="Tensors and Dynamic neural networks in Python with s


  Downloading deap-1.3.1-cp37-cp37m-win_amd64.whl (108 kB)
Collecting update_checker
  Downloading update_checker-0.17-py2.py3-none-any.whl (7.0 kB)
Collecting stopit
  Downloading stopit-1.1.2.tar.gz (18 kB)
Collecting torch
  Downloading torch-0.1.2.post2.tar.gz (128 kB)
Building wheels for collected packages: stopit, torch
  Building wheel for stopit (setup.py): started
  Building wheel for stopit (setup.py): finished with status 'done'
  Created wheel for stopit: filename=stopit-1.1.2-py3-none-any.whl size=11959 sha256=b18553dd671c615d011c207d7b81570e6d6d47d050bc7d11d3d0d928abe482c0
  Stored in directory: c:\users\henon\appdata\local\pip\cache\wheels\e2\d2\79\eaf81edb391e27c87f51b8ef901ecc85a5363dc96b8b8d71e3
  Building wheel for torch (setup.py): started
  Building wheel for torch (setup.py): finished with status 'error'
  Running setup.py clean for torch
Successfully built stopit
Failed to build torch
Installing collected packages: deap, update-checker, stopit, torch
    Running

In [86]:
from sklearn.model_selection import train_test_split

X = df['cleaned']
y = df['Label']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3)

In [91]:
X

1       wont America Nato help u If wont help u help I...
2       adorable sang opening ceremony That fake Georg...
3       b refuse Israel weapon attack Iran president o...
4       expert admit legalise drug South Osetia pictur...
5       Mom missing gay man Too bad cheerleader still ...
                              ...                        
1984    Barclays RBS share suspended trading tanking s...
1985    Scientists To Australia If You Want To Save Th...
1986    Explosion At Airport In former president Terro...
1987    Jamaica proposes marijuana dispenser tourist a...
1988    A woman Mexico City finally received birth cer...
Name: cleaned, Length: 1988, dtype: object

In [92]:
y

1       1
2       0
3       0
4       1
5       1
       ..
1984    0
1985    1
1986    1
1987    1
1988    1
Name: Label, Length: 1988, dtype: int64

In [93]:
from sklearn.model_selection import train_test_split
import os
from tpot import TPOTClassifier

# instanciate TPOTClassifier
tpot = TPOTClassifier(config='TPOT text', generations=5, population_size=50, verbosity=2)
# process autoML with TPOT
tpot.fit(X_train, y_train)
# print score
print(tpot.score(X_val, y_val))

TypeError: __init__() got an unexpected keyword argument 'config'