In [1]:
# Neural Network Classifier with Scikit
import pandas as pd, numpy as np, json, re, pickle

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, auc, precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier

In [2]:
# read controversy data

con_df = pd.read_json(r'C:\Users\Gabe\Documents\Bellevue University\Data Mining\Week 9 & 10\categorized-comments.jsonl', lines=True)

# check size, structure and categories
con_df.head()
print('Size: ', len(con_df), '\n',
      'Shape: ', con_df.info(), '\n',
      'Categories: ', con_df['cat'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 606476 entries, 0 to 606475
Data columns (total 2 columns):
cat    606476 non-null object
txt    606476 non-null object
dtypes: object(2)
memory usage: 9.3+ MB
Size:  606476 
 Shape:  None 
 Categories:  video_games               435542
sports                    145823
science_and_technology     25111
Name: cat, dtype: int64


In [3]:
def clean_text(text):
    """
    Remove punctuations and special characters, makes lower case
    Args: text 
    Output: text
    """
    
    text=text.lower()
    text=re.sub('</?.*?>',' <>', text)
    text=re.sub('\\d|\\W+|_',' ',text)
    text=re.sub('[^a-zA-Z]'," ", text)
    
    return text
# Create stop words list

stop_words = stopwords.words('english')

In [4]:
size = 30000    # sample size
replace = True  # with replacement
fn = lambda obj: obj.loc[np.random.choice(obj.index, size, replace),:]

controversy = con_df.groupby('cat', as_index=False).apply(fn)

# free up memory

del con_df

controversy['txt'] = controversy['txt'].apply(lambda x:clean_text(x))
controversy.reset_index(drop=True, inplace=True)

controversy.head()

Unnamed: 0,cat,txt
0,science_and_technology,gt no reselling of phones just chiming in to ...
1,science_and_technology,just curious any specific reason why i mean i ...
2,science_and_technology,of course they won t tell you but at the same ...
3,science_and_technology,you re not missing much v owner
4,science_and_technology,nice looking phone


In [5]:
def documents(corpus):
    return list(corpus.reviews())

def continuous(corpus):
    return list(corpus.scores())

def make_categorical(corpus):
    """
    terrible: 0.0 < y <= 3.0
    okay: 3.0 < y <= 5.0
    great: 5.0 < y <= 7.0
    amazing: 7.0 < y < 10.1
    """
    return np.digitize(continuous(corpus), [0.0, 3.0, 5.0, 7.0, 10.1])


def train_model(path, model, continuous=True, saveto=None, cv=12):
    # Load the corpus dat and labels for classification
    corpus = PickledReviewsReader(path)
    X = documents(corpus)
    if continuous:
        y = continuous(corpus)
        scoring = 'r2_score'
    else:
        y = make_categorical(corpus)
        scoring = 'f1_score'
        
    # Compute corss-validation scores
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    
    # Write to disk if specified
    #if saveto:
    #    joblib.dump(model, saveto)
        
    # Fit the model on entire dataset
    model.fit(X, y)
    
    # Return scores
    return scores

In [6]:
#from transformer import TextNormalizer
#from reader import PickledReviewsReader
from sklearn.pipeline import Pipeline
#from sklearn.feature_extraction.text import TfidffVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score


In [7]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(controversy['txt']).toarray()
labels = controversy['cat']
features.shape

(90000, 15581)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB


X_train, X_test, y_train, y_test = train_test_split(controversy['txt'], controversy['cat'], random_state = 0)


count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)


tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)


clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [9]:
print(clf.predict(count_vect.transform(["Thanks again Cleveland and Houston "])))

['sports']


In [10]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MultinomialNB())])
text_clf = text_clf.fit(X_train, y_train)

In [11]:
comments_test = controversy.sample(n=100)
y_pred = text_clf.predict(X_train)
print('Accuracy: ' + str(np.mean(y_pred == y_train) * 100))

Accuracy: 83.48148148148148


In [15]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_train, y_pred))

                        precision    recall  f1-score   support

science_and_technology       0.81      0.90      0.85     22589
                sports       0.89      0.80      0.84     22462
           video_games       0.81      0.80      0.81     22449

              accuracy                           0.83     67500
             macro avg       0.84      0.83      0.83     67500
          weighted avg       0.84      0.83      0.83     67500



In [16]:
print(confusion_matrix(y_train, y_pred))

[[20338   638  1613]
 [ 1966 18003  2493]
 [ 2769  1671 18009]]
