In [34]:
import numpy as np
import pandas as pd
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords # Import the stop word list
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV

In [35]:
df = pd.read_csv('comments.csv')

In [36]:
df.shape

(17885, 2)

In [37]:
df.sample(5)

Unnamed: 0,body,subreddit
415,If two guys are discussing the geofinancial an...,rockets
15323,76ers vs Nuggets\n\nNew era.,nba
5551,u/dr-wong u/sirjackiechiles this hoes body wou...,rockets
8343,Idk if I wanna watch. So disappointed,nba
6282,"now, this is the type of content i like to see...",rockets


### Target

In [38]:
df['target'] = df['subreddit'].map({'nba': 0, 'rockets': 1})
df.drop('subreddit', axis=1, inplace=True)
df.sample(5)

Unnamed: 0,body,target
7250,#MORTAL KOMBAT!!!,1
8591,He played fewer mins last season than in 2016,0
17452,"Double-edged sword, wait until Marcus Smart dr...",0
14629,Harden,0
16640,Never forget this pearler - https://youtu.be/5...,0


### Duplicates

In [39]:
# no na values in dataframe

df.isna().sum()

body      0
target    0
dtype: int64

In [40]:
# checking how many duplicate comments there are
df.duplicated().sum()

1336

In [41]:
# most of the duplicate comments are comments that were removed 
# duplicate comments were ether removed comments or comments that were posted twice with same text 
df[df.duplicated()].head(10)

Unnamed: 0,body,target
13,[removed],1
41,Says the dumbass,1
63,[removed],1
72,I had to unfollow Lebron on everything. I thi...,1
73,I had to unfollow Lebron on everything. I thi...,1
83,[removed],1
84,[removed],1
92,[removed],1
96,[removed],1
97,[removed],1


In [42]:
#drop rows where there are duplicates
df.drop_duplicates(inplace=True)
df = df.reset_index(drop=True)

In [43]:
df.shape

(16549, 2)

In [44]:
# drop rows where body = ''
df = df[df['body'] != '']
df = df.reset_index(drop=True)

In [45]:
df.shape

(16549, 2)

### Clean

In [46]:
df['body'][2]

"The change with China is going to happen when its economic bubble eventually bursts.\n\nThis is probably the best short breakdown of the Chinese cultural psyche that I've seen online:\n\nhttps://www.resetera.com/threads/rockets-gm-daryl-morey-tweets-in-support-of-hong-kong-protest-rockets-organization-denounces-tweet-update-nba-suspended-in-china.145209/page-13#post-25281281"

In [47]:
from bs4 import BeautifulSoup             

# Initialize the BeautifulSoup object on a single movie review     
example1 = BeautifulSoup(df['body'][2])

# Print the raw review and then the output of get_text(), for 
# comparison
print(df['body'][2])
print()
print(example1.get_text())

The change with China is going to happen when its economic bubble eventually bursts.

This is probably the best short breakdown of the Chinese cultural psyche that I've seen online:

https://www.resetera.com/threads/rockets-gm-daryl-morey-tweets-in-support-of-hong-kong-protest-rockets-organization-denounces-tweet-update-nba-suspended-in-china.145209/page-13#post-25281281

The change with China is going to happen when its economic bubble eventually bursts.

This is probably the best short breakdown of the Chinese cultural psyche that I've seen online:

https://www.resetera.com/threads/rockets-gm-daryl-morey-tweets-in-support-of-hong-kong-protest-rockets-organization-denounces-tweet-update-nba-suspended-in-china.145209/page-13#post-25281281


In [48]:
# Use regular expressions to do a find-and-replace
letters_only = re.sub("[^a-zA-Z]",           # The pattern to search for
                      " ",                   # The pattern to replace it with
                      example1.get_text())   # The text to search

In [49]:
# Convert letters_only to lower case.
lower_case = letters_only.lower()

# Split lower_case up at each space.
words = lower_case.split() # This is like a manual tokenizer!

In [50]:
# Check first ten words.
words[0:10]

['the',
 'change',
 'with',
 'china',
 'is',
 'going',
 'to',
 'happen',
 'when',
 'its']

In [1]:
# Funtion adapted from Lecture 

def review_to_words(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove HTML.
    review_text = BeautifulSoup(text).get_text()

    # Remove HTML special entities (e.g. &amp;)
    text = re.sub(r'\&\w*;', '', text)
    
    # Remove non-letters. (Punctuation)
    text = re.sub("[^a-zA-Z]", " ", text)
    
    # Remove hyperlinks
    text = re.sub(r'https?:\/\/.*\/\w*', '', text)
    
    # Remove words with 2 or fewer letters
    text = re.sub(r'\b\w{1,2}\b', '', text)
    
    # Remove whitespaces
    text = re.sub(r'\s\s+', ' ', text)
     
    return text

In [52]:
df['body'] = df['body'].apply(review_to_words)

In [53]:
df.shape

(16549, 2)

In [54]:
df['body'][2]

'the change with china going happen when its economic bubble eventually bursts this probably the best short breakdown the chinese cultural psyche that seen online https www resetera com threads rockets daryl morey tweets support hong kong protest rockets organization denounces tweet update nba suspended china page post '

### Lemmatize

In [55]:
lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    words = text.split()
    lemma_words = ''
    for word in words:
        lemma_words += (lemmatizer.lemmatize(word) + ' ')
    return lemma_words   

In [56]:
df['body'] = df['body'].apply(lemmatize_words)

In [57]:
df.shape

(16549, 2)

In [58]:
df.to_csv('clean_comments.csv', index=False)

### Find most common words from each subreddit

In [59]:
# Get most frequently used words, can change max_features

count_vect = CountVectorizer(analyzer = "word", 
                             tokenizer = None, 
                             preprocessor = None,
                             stop_words = "english", 
                             max_features = 35) 

# input for CountVectorizer is an array of strings
vector_input_rockets = df[df['target'] == 1]['body']

# fit_transform the vectorizer
rockets_words = count_vect.fit_transform(vector_input_rockets)

# convert output to a Numpy array
rockets_words = rockets_words.toarray()

In [60]:
# rockets most common words
rockets_word_list = count_vect.get_feature_names()
print(rockets_word_list)

['china', 'chinese', 'don', 'fan', 'fuck', 'game', 'going', 'good', 'guy', 'ha', 'harden', 'just', 'know', 'like', 'lol', 'look', 'make', 'morey', 'nba', 'people', 'player', 'really', 'right', 'rocket', 'rus', 'say', 'season', 'team', 'thing', 'think', 'time', 'wa', 'want', 'way', 'year']


In [61]:
# Get most frequently used words, can change max_features

count_vect = CountVectorizer(analyzer = "word", 
                             tokenizer = None, 
                             preprocessor = None,
                             stop_words = "english", 
                             max_features = 35) 

# input for CountVectorizer is an array of strings
vector_input_nba = df[df['target'] == 0]['body']

# fit_transform the vectorizer
nba_words = count_vect.fit_transform(vector_input_nba)

# convert output to a Numpy array
nba_words = nba_words.toarray()

In [62]:
# nba most common words
nba_word_list = count_vect.get_feature_names()
print(nba_word_list)

['best', 'better', 'china', 'doesn', 'don', 'fan', 'game', 'going', 'good', 'got', 'guy', 'ha', 'just', 'know', 'lebron', 'like', 'lol', 'make', 'nba', 'people', 'play', 'player', 'point', 'really', 'right', 'say', 'season', 'shit', 'team', 'thing', 'think', 'time', 'wa', 'way', 'year']


### Add the most common words that are in both subreddits to stopwords 'english'

In [63]:
# Function to find elements that are the same in two lists 

def common_words(a, b): 
    a_set = set(a) 
    b_set = set(b) 
    if (a_set & b_set): 
        print(a_set & b_set) 
    else: 
        print("No common elements")  

In [64]:
common_words(rockets_word_list, nba_word_list)

{'player', 'just', 'game', 'don', 'like', 'thing', 'know', 'right', 'people', 'year', 'season', 'guy', 'say', 'going', 'really', 'think', 'lol', 'time', 'way', 'make', 'team', 'nba', 'ha', 'good', 'china', 'fan', 'wa'}


In [65]:
union_words = ['game', 'make', 'guy', 'lol', 'going', 'time', 'player', 'season', 'think', 'wa', 'china', 'like', 'team', 'thing', 'year', 'nba', 'don', 'really', 'fan', 'people', 'know', 'say', 'way', 'ha', 'right', 'good', 'just']

In [66]:
# extending stopwords.words('english') to include words that are common in both subreddits

stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(union_words)

In [67]:
%store stopwords

Stored 'stopwords' (list)
