In [1]:
# Upgrade spaCy in the kernel's environment. The version is not pinned; the
# recorded run below resolved to spacy 3.3.1.
%pip install -U spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy
  Downloading spacy-3.3.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.2 MB)
[K     |████████████████████████████████| 6.2 MB 4.3 MB/s 
[?25hCollecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 38.8 MB/s 
Collecting spacy-legacy<3.1.0,>=3.0.9
  Downloading spacy_legacy-3.0.9-py2.py3-none-any.whl (20 kB)
Collecting thinc<8.1.0,>=8.0.14
  Downloading thinc-8.0.17-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (660 kB)
[K     |████████████████████████████████| 660 kB 43.2 MB/s 
Collecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.2-py3-none-any.whl (7.2 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 50.6 MB/s 
Co

In [2]:
import pandas as pd
import spacy
from sklearn.model_selection  import train_test_split
from spacy.tokens import DocBin

In [3]:
import spacy.cli

# Fetch the medium English model only when it is not already installed, so
# re-running the notebook does not repeat the download.
if not spacy.util.is_package("en_core_web_md"):
    spacy.cli.download("en_core_web_md")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [4]:
# Load the posts dataset and preview its first rows.
posts_df = pd.read_csv(filepath_or_buffer='./posts_scores_dates.csv')
posts_df.head()

Unnamed: 0,id,title,created_utc,comments_old,score_old,post_time_utc,score,comments,percentile,viral
0,uu6g0w,[homemade] Polynesian (Chick-Fil-A sauce) chic...,1653077037,0,1,2022-05-20 16:03:57,69,7.0,0.83673,0
1,uu6cni,"[I ate] Scotch mutton pie, pub in Edinburgh",1653076799,0,1,2022-05-20 15:59:59,37,4.0,0.699491,0
2,uu6apo,[homemade] 🇲🇦,1653076639,0,1,2022-05-20 15:57:19,1,0.0,0.117647,0
3,uu644e,"[homemade] Chilli Paneer, Spinach, Potatoes wi...",1653076091,0,1,2022-05-20 15:48:11,16,2.0,0.443585,0
4,uu5x2y,"[Homemade] Tart - Salmon, spinach and goat cheese",1653075500,0,1,2022-05-20 15:38:20,1,0.0,0.117647,0


In [5]:
# Load the medium English spaCy pipeline and list its components.
nlp = spacy.load('en_core_web_md')
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f1c9f4182f0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7f1c9f4181a0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7f1c9f2a9ad0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7f1c9f1b9960>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7f1c9f1c4e60>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7f1c9f2a9d50>)]

Try using spaCy as the tokenizer in scikit-learn's `CountVectorizer`.

In [6]:
import numpy as np
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, SCORERS, mean_squared_error, accuracy_score, f1_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline, Pipeline

In [7]:
# Features are the post titles; the target is the `viral` label.
X, y = posts_df['title'], posts_df['viral']

# Hold out a test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20)

In [8]:
def get_metrics(function):
    """Print balanced accuracy, F1 and accuracy for the train and test splits.

    Parameters
    ----------
    function : fitted estimator
        Any object exposing ``predict`` (for example a fitted pipeline or a
        fitted grid search).

    Notes
    -----
    Reads the notebook-level ``X_train``, ``X_test``, ``y_train`` and
    ``y_test``. Each split is predicted exactly once and the predictions are
    reused for all three metrics, rather than calling ``predict`` again for
    every metric.
    """
    train_pred = function.predict(X_train)
    test_pred = function.predict(X_test)

    # (label used in the printed message, scoring function), in print order.
    metrics = (
        ('balanced accuracy', balanced_accuracy_score),
        ('f1', f1_score),
        ('accuracy', accuracy_score),
    )
    for label, scorer in metrics:
        print(f'The training {label} score is {scorer(y_train, train_pred)}')
        print(f'The testing {label} score is {scorer(y_test, test_pred)}')

In [9]:
# Baseline: bag-of-words counts feeding a class-weighted logistic regression.
pipe_lr = Pipeline(steps=[
    ('cv', CountVectorizer()),
    ('lr', LogisticRegression(class_weight='balanced', max_iter=1000, random_state=20)),
])

pipe_lr.fit(X_train, y_train)

Pipeline(steps=[('cv', CountVectorizer()),
                ('lr',
                 LogisticRegression(class_weight='balanced', max_iter=1000,
                                    random_state=20))])

In [10]:
# Report train/test metrics for the baseline logistic-regression pipeline.
get_metrics(pipe_lr)

The training balanced accuracy score is 0.9614296525875738
The testing balanced accuracy score is 0.5971292279642902
The training f1 score is 0.410958904109589
The testing f1 score is 0.0984848484848485
The training accuracy score is 0.9415760869565217
The testing accuracy score is 0.911819192293442


Create a tokenizer function that uses spaCy's tokenization.

In [11]:
def spacy_tokenizer(doc):
    """Split ``doc`` into a list of spaCy token strings.

    Intended to be passed as the ``tokenizer`` of a ``CountVectorizer``.

    Only the tokenizer of the loaded ``nlp`` pipeline is run (via
    ``nlp.make_doc``). The remaining pipeline components (tagger, parser,
    attribute ruler, lemmatizer, NER) are skipped because only the verbatim
    token text is used here, which makes each call much cheaper.

    Parameters
    ----------
    doc : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The verbatim text of each token, in order.
    """
    return [token.text for token in nlp.make_doc(doc)]

Test the spaCy tokenizer to make sure it works.

In [12]:
# Count vectorizer that delegates tokenization to the spaCy-based tokenizer.
cvs = CountVectorizer(tokenizer=spacy_tokenizer)

In [13]:
# Learn the vocabulary from the training titles and build their document-term matrix.
X_train_cvs = cvs.fit_transform(X_train)

In [14]:
# Densify only the first five rows for the preview; converting the whole sparse
# document-term matrix to a dense array just to display its head is wasteful.
pd.DataFrame(X_train_cvs[:5].toarray(), columns=cvs.get_feature_names_out()).head()

Unnamed: 0,sponge,!,"""",#,$,%,&,','d,'ll,...,🥸,🦀,🦄,🧁,🧅,🧇,🧑,🫐,🫠,🫶🏼🫑
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Now that the tokenizer is working, let's apply it to the best-performing model from the other notebook.

In [15]:
# Oversampling pipeline: token counts -> random oversampling -> random forest.
# Both stochastic steps are seeded (same seed as the rest of the notebook) so
# that repeated runs give the same resampling and the same forest.
pipe_ros = Pipeline([
    ('cv', CountVectorizer()),
    ('ros', RandomOverSampler(random_state=20)),
    ('rfc', RandomForestClassifier(random_state=20))
])

In [16]:
# Hyperparameter grid for `pipe_ros`.
# A float `sampling_strategy` is the target minority/majority ratio after
# resampling and must lie in (0, 1]; the former 0.0 entry is outside that
# range, so every fit using it failed and only wasted search time.
params_ros = {
    'cv__ngram_range': [(1, 2)],
    'cv__min_df': [1, 2, 4],
    'cv__lowercase': [True, False],
    'cv__tokenizer': [None, spacy_tokenizer],
    'ros__sampling_strategy': [.25, .5, .75, 1.0],
    'rfc__n_jobs': [-1],
}

In [None]:
# Exhaustive search over the oversampling pipeline's grid, ranked by balanced
# accuracy and run on all available cores.
gs_ros = GridSearchCV(
    estimator=pipe_ros,
    param_grid=params_ros,
    scoring='balanced_accuracy',
    n_jobs=-1,
)

gs_ros.fit(X_train, y_train)



In [None]:
# Show the pipeline refit with the best parameter combination found by the search.
gs_ros.best_estimator_

In [None]:
# Report train/test metrics for the fitted grid search.
get_metrics(gs_ros)

The grid search combining random oversampling with the spaCy tokenizer timed out on Google Colab, so fitting could not be completed.