# Sentiment analysis

In [1]:
import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm

tqdm.pandas()

In [2]:
df = pd.read_parquet('s3://ling583/sentiment.parquet', storage_options={'anon': True})

-----

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
train, test = train_test_split(
    df, test_size=0.2, stratify=df["sentiment"], random_state=619
)

In [5]:
import spacy

nlp = spacy.load(
    "en_core_web_sm",
    exclude=["tagger", "parser", "ner", "lemmatizer", "attribute_ruler"],
)


def tokenize(text):
    doc = nlp.tokenizer(text)
    return [t.norm_ for t in doc if t.is_alpha]

In [6]:
import multiprocessing as mp

In [7]:
with mp.Pool() as p:
    train["tokens"] = list(p.imap(tokenize, tqdm(train["text"]), chunksize=100))
    test["tokens"] = list(p.imap(tokenize, tqdm(test["text"]), chunksize=100))

  0%|          | 0/40000 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["tokens"] = list(p.imap(tokenize, tqdm(train["text"]), chunksize=100))


  0%|          | 0/10000 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["tokens"] = list(p.imap(tokenize, tqdm(test["text"]), chunksize=100))


In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline

In [9]:
sgd = make_pipeline(CountVectorizer(analyzer=identity), SGDClassifier())
sgd.fit(train["tokens"], train["sentiment"])
predicted = sgd.predict(test["tokens"])
print(classification_report(test["sentiment"], predicted))

              precision    recall  f1-score   support

         bad       0.88      0.62      0.72      2678
        good       0.87      0.97      0.92      7322

    accuracy                           0.87     10000
   macro avg       0.87      0.79      0.82     10000
weighted avg       0.87      0.87      0.87     10000

