## Imports

In [47]:
import numpy as np
import pandas as pd
import nltk
import re
import matplotlib.pyplot as plt
import string 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

## Loading data

In [2]:
train = pd.read_json('music_reviews_train.json', lines=True)
dev = pd.read_json('music_reviews_dev.json', lines=True)
test = pd.read_json('music_reviews_test_masked.json', lines=True)

train = train[['reviewText', 'sentiment']]
dev = dev[['reviewText', 'sentiment']]
test = test[['reviewText', 'sentiment']]

train.replace(np.nan, '', regex=True, inplace=True)
dev.replace(np.nan, '', regex=True, inplace=True)
test.replace(np.nan, '', regex=True, inplace=True)

## Pre-processing

In [3]:
import string
from nltk.corpus import stopwords

def preprocess_text(text):
    puncs = string.punctuation
    stops = stopwords.words('english')
    no_punc = [token for token in text if token not in puncs]                               #remove punctuations
    no_punc = ''.join(no_punc)
    no_stops = [word.lower() for word in no_punc.split() if word.lower() not in stops]      #remove stopwords
    
    return ' '.join(no_stops)

In [4]:
train_clean = train['reviewText'].apply(preprocess_text).tolist()
dev_clean = dev['reviewText'].apply(preprocess_text).tolist()
test_clean = test['reviewText'].apply(preprocess_text).tolist()

### Changed positive and negative to 1 and 0

In [5]:
y_train = train['sentiment'].map({'positive': 1, 'negative': 0})
y_dev = dev['sentiment'].map({'positive': 1, 'negative': 0})
y_test = test['sentiment'].map({'positive': 1, 'negative': 0})

## Bag-of-words

##### Max-df: When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.
##### Min-df: When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.
##### Ngram_range: Unigrams, bigrams etc.

##### Tried changing these to see if models improved

In [118]:
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2,ngram_range = (1,1)).fit(train_clean) #Vocab size much smaller
bow_vectorizer
print(f'Vocab size: {len(bow_vectorizer.vocabulary_)}') 

Vocab size: 41181


#### This reduces the vocab size quite a bit (97k w/o)

In [112]:
train_bow = bow_vectorizer.transform(train_clean)
dev_bow = bow_vectorizer.transform(dev_clean)
test_bow = bow_vectorizer.transform(test_clean)
print(f'Shape of sparse matrix: {train_bow.shape}')

Shape of sparse matrix: (100000, 41181)


## TF-IDF

In [113]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(train_bow)

In [114]:
train_tfidf = tfidf_transformer.transform(train_bow)
dev_tfidf = tfidf_transformer.transform(dev_bow)
test_tfidf = tfidf_transformer.transform(test_bow)
print(train_tfidf.shape)

(100000, 41181)


## Decision-tree

In [115]:
dct = DecisionTreeClassifier(criterion='gini') #maybe try max_depth/pruning
dct.fit(train_tfidf,y_train)

DecisionTreeClassifier()

In [116]:
predictions = dct.predict(dev_tfidf)
accuracy_score(predictions, y_dev)

0.8029

## Logistic Regression

In [91]:
Log_Reg = LogisticRegression(solver='lbfgs') # lbfgs solver much quicker
Log_Reg.fit(train_tfidf,y_train)

LogisticRegression()

In [92]:
predictions = Log_Reg.predict(dev_tfidf)
accuracy_score(predictions, y_dev)

0.8946

## Naive Bayes

In [93]:
NaiveBayes = MultinomialNB().fit(train_tfidf, y_train)

In [94]:
predictions = NaiveBayes.predict(dev_tfidf)
accuracy_score(predictions, y_dev)

0.8758

## Random Forest

In [101]:
RF = RandomForestClassifier(criterion = 'entropy') # could try max_depth/pruning
RF.fit(train_tfidf,y_train)

RandomForestClassifier(criterion='entropy')

In [102]:
predictions = RF.predict(dev_tfidf)
accuracy_score(predictions, y_dev)

0.8773

### Ignore below, first attempts before copying christian bow and tfidf

In [32]:
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000)

In [33]:
bow_train = bow_vectorizer.fit_transform(train_clean)
bow_dev = bow_vectorizer.fit_transform(dev_clean)
bow_test = bow_vectorizer.fit_transform(test_clean)

df_bow_train = pd.DataFrame(bow_train.todense())
df_bow_dev = pd.DataFrame(bow_dev.todense())
df_bow_test = pd.DataFrame(bow_test.todense())

MemoryError: Unable to allocate 30.7 GiB for an array with shape (100000, 41181) and data type int64

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf=TfidfVectorizer(max_df=0.90, min_df=2,max_features=1000)

tfidf_matrix_train = tfidf.fit_transform(train_clean)
tfidf_matrix_dev = tfidf.fit_transform(dev_clean)
tfidf_matrix_test = tfidf.fit_transform(test_clean)

df_tfidf_train = pd.DataFrame(tfidf_matrix_train.todense())
df_tfidf_dev = pd.DataFrame(tfidf_matrix_dev.todense())
df_tfidf_test = pd.DataFrame(tfidf_matrix_test.todense())

In [16]:
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
Log_Reg = LogisticRegression(random_state=0,solver='lbfgs')
Log_Reg.fit(df_bow_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()