In [640]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [951]:
#the normal stuff
import numpy as np
import pandas as pd
from collections import Counter

In [342]:
# to get JSON files from TOS API
import urllib, json
from pandas.io.json import json_normalize

In [963]:
#create a list of file names to pull each json file
from os import listdir
from os.path import isfile, join

In [784]:
# nlp imports, chose TFIDF since some words are repeated
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split

In [969]:
#model imports
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Create Master DF

## Data Cleaning

All of the data for ToS;DR is stored in a directory of different json files, each labeled with a different company name. 

In [1044]:
#Pull out the data 
companies = [f for f in listdir('tosdr.org/api/1/service') if isfile(join('tosdr.org/api/1/service', f))]

#try and find non-english companies to remove, looking at every letter 
#(not just first index in case non-english word appears later)

import string
ascii_chars = set(string.printable)
nonenglish = {word for word in companies for letter in word if letter not in ascii_chars}
nonenglish.remove( 'coinbase–.json')
companies = [company for company in companies if company not in nonenglish]

In [1045]:
data_list = []
for company in companies:
    with open(f'tosdr.org/api/1/service/{company}') as json_data:
        data_list.append(json.load(json_data))

In [1047]:
#Create main dataframe, drop unnecessary columns, expand list of lists

pd_list = []
for data in data_list:
    pd_list.append(pd.DataFrame.from_dict(data['pointsData']).T.loc[:,['id','quoteText','services','title','tosdr']])

alldata_df = pd.concat(pd_list,axis=0).reset_index()
todsr_df = json_normalize(alldata_df['tosdr'])
df = pd.concat((alldata_df,todsr_df),axis=1)
df = df.drop('tosdr',axis=1).explode('services')

In [1048]:
#create master document column with tldr as base and quote text as secondar

df['document'] = df['tldr']
df['document'] = np.where(df['document'] == '',df['quoteText'],df['document'])
df['document'] = np.where(df['document'] == 'Generated through the annotate view',df['quoteText'],df['document'])
df['document'].fillna(df['quoteText'])

#if there is no text in the tldr or quoteText column, drop and reset index
df.dropna(axis=0,subset=['document'],inplace=True)
df = df.reset_index(drop=True)

In [1049]:
#create labels, the point is really to distinquish "bad" language so labeled 
#both good and neutral as a success

df['label'] = df['point']
df.label = df.label.replace(['blocker','bad','neutral','good'],['bad','bad','neutral','good'])

In [1053]:
df.iloc[2197]

index                                                          4766
id                                                             4766
quoteText         Summary</strong>\n</p>\n<p>We collect informat...
services                                                   kink-com
title             The service provides details about what kinds ...
binding                                                        True
case              The service provides details about what kinds ...
point                                                          good
privacyRelated                                                  NaN
score                                                            30
tldr              Generated through the annotate view; updated t...
sources                                                         NaN
irrelevant                                                      NaN
reason                                                          NaN
tmp_rating                                      

In [1051]:
#random lookup to check data

df.loc[np.random.randint(1,2564,5),['quoteText','tldr','document']]

Unnamed: 0,quoteText,tldr,document
2197,Summary</strong>\n</p>\n<p>We collect informat...,Generated through the annotate view; updated t...,Generated through the annotate view; updated t...
42,\n<p>Please note that some parts of our Servic...,Generated through the annotate view,\n<p>Please note that some parts of our Servic...
1569,0.\nData Retention </strong>\n</p>\n<p>We reta...,Generated through the annotate view,0.\nData Retention </strong>\n</p>\n<p>We reta...
2071,,Short and to the point.,Short and to the point.
1277,You are responsible for maintaining the confid...,Generated through the annotate view,You are responsible for maintaining the confid...


In [575]:
#the land of sad documents
df.document[0],df.document[722]

('The terms of service state that "Signal does not sell, rent or monetize your personal data or content in any way – ever."',
 'Last updated: June 07, 2019')

# First Shitty Model 
### Start taking the parts of the dataframe that I want to keep

In [1054]:
'''create a list of clean companies that can be used as stop words - use 
springcleaning function to format them in a similar way to the rest of 
tfiddy
'''
dirty_companies = [company.split('.') for company in companies]
clean_companies = [springcleaning(company[0]) for company in dirty_companies]
cleanest_companies = [company for sublist in clean_companies for company in sublist]

In [1055]:
cleanest_companies

['signal',
 'musik',
 'sammler',
 'whatismyip',
 'com',
 'email',
 'cz',
 'upcloud',
 'customink',
 'brilliant',
 'pure',
 'zenimaxmediainc',
 'qwant',
 'virtbiz',
 'any',
 'do',
 'visible',
 'sprint',
 'stackoverflow',
 'symantec',
 'gitlab',
 'airbnb',
 'websaver',
 'mcdonald',
 'kitsu',
 'meetup',
 'weebly',
 'pexgle',
 'apple',
 'web',
 'de',
 'dr',
 'mcdougall',
 'shealth',
 'medicalcenter',
 'newegg',
 'com',
 'yelp',
 'carfax',
 'jetbrains',
 'diytubevideocommunity',
 'windowslogicproductions',
 'virgin',
 'discogs',
 'digitaladvertisingplatform',
 'reklamstore',
 'cnn',
 'forbes',
 'npm',
 'w3schools',
 'etesync',
 'librarything',
 'wikimedia',
 'algolia',
 'imdb',
 'freecodecamp',
 'steam',
 'crunchyroll',
 'reputation',
 'coursehero',
 'vox',
 'osu',
 'nordvpn',
 'myspace',
 'tellonym',
 'goguardian',
 'quora',
 'lastpass',
 'mewe',
 'chilliapps',
 'abandonmentprotector',
 'ancestry',
 'pythonanywhere',
 'sonic',
 'net',
 'wikia',
 'allrecipes',
 'kongregate',
 'moddb',
 'com

In [1056]:
'''uses a method in tfidf to explore the initial clean up step - lowering 
case, removing punctuation, and splitting words'''

tfiddy = TfidfVectorizer()
tfiddy.fit(X_train)
springcleaning = tfiddy.build_analyzer()
before = df['document'][1]
after = springcleaning(df['document'][1])
print(f'Before clean up: \n {before}\n')
print(f'After clean up: \n {after}\n')

Before clean up: 
 The terms of service state: "You must be at least 13 years old to use our Services. The minimum age to use our Services without parental approval may be higher in your home country."

After clean up: 
 ['the', 'terms', 'of', 'service', 'state', 'you', 'must', 'be', 'at', 'least', '13', 'years', 'old', 'to', 'use', 'our', 'services', 'the', 'minimum', 'age', 'to', 'use', 'our', 'services', 'without', 'parental', 'approval', 'may', 'be', 'higher', 'in', 'your', 'home', 'country']



In [1057]:
X = df['document']
y = df['label']

In [1058]:
X_good = df[df['point'] == 'good']['document']
X_neutral = df[df['point'] == 'neutral']['document']
X_bad = df[df['point'] == 'bad']['document']

In [1059]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [1060]:
#MX_train, MX_test, My_train, My_test = train_test_split(X,multiy)

In [1061]:
#tfiddy = TfidfVectorizer()
tfiddy = TfidfVectorizer(stop_words=cleanest_companies)
tfiddy.fit(X_train)
X_train_tfiddy = tfiddy.transform(X_train).toarray()
X_test_tfiddy = tfiddy.transform(X_test).toarray()

In [1062]:
#countvoncount the countvectorizer
voncount = CountVectorizer(stop_words=cleanest_companies)
voncount.fit(X_train)
X_train_voncount = voncount.transform(X_train).toarray()
X_test_voncount = voncount.transform(X_test).toarray()

In [1063]:
voncount = CountVectorizer(stop_words=cleanest_companies)
voncount_all = voncount.fit_transform(df['document']).toarray()
voncount_all.shape

(2549, 5077)

In [1064]:
toswords = voncount.get_feature_names()

In [1065]:
all_sum = voncount_all.sum()
good_sum = voncount_all[df['point'] == 'good'].sum()
neutral_sum = voncount_all[df['point'] == 'neutral'].sum()
bad_sum = voncount_all[df['point'] == 'bad'].sum()
print(f'Count of all words: {all_sum} \nCount of words in good documents: {good_sum}\
\nCount of words in netural documents: {neutral_sum} \nCount of words in bad documents: {bad_sum}')

Count of all words: 100436 
Count of words in good documents: 24371
Count of words in netural documents: 25654 
Count of words in bad documents: 48933


In [1066]:
all_count = np.sum(voncount_all,axis=0)
good_count = np.sum(voncount_all[df['point'] =='good'],axis = 0)
neutral_count = np.sum(voncount_all[df['point'] =='neutral'],axis = 0)
bad_count = np.sum(voncount_all[df['point'] =='bad'],axis = 0)

In [1067]:
#atruehonor to meet such a distinguished word

atruehonor = {}
ahonor = {}
afalsehonor = {}
alph = 1.2

for class_word_count,total_word_count,word in zip(good_count,all_count,toswords):
    atruehonor[word] = ((class_word_count**alph)/total_word_count)
for class_word_count,total_word_count,word in zip(neutral_count,all_count,toswords):
    ahonor[word] = ((class_word_count**alph)/total_word_count)
for class_word_count,total_word_count,word in zip(bad_count,all_count,toswords):
    afalsehonor[word] = ((class_word_count**alph)/total_word_count)

In [1068]:
Counter(atruehonor).most_common(15)

[('days', 1.7665314390948643),
 ('want', 1.7427325876521218),
 ('ownership', 1.6996910167106056),
 ('la', 1.6952182030724354),
 ('never', 1.6785540014044966),
 ('nous', 1.6153942662021779),
 ('unsubscribe', 1.6113486821132612),
 ('delete', 1.5907632457820151),
 ('logging', 1.5848931924611134),
 ('rent', 1.5848931924611134),
 ('30', 1.5836862258314204),
 ('deleted', 1.5588992181955612),
 ('easy', 1.5518455739153594),
 ('erase', 1.5518455739153594),
 ('factor', 1.5518455739153594)]

In [1069]:
Counter(ahonor).most_common(15)

[('13', 2.5561596864740617),
 ('age', 2.319414969825008),
 ('years', 2.315420595626855),
 ('2018', 2.2028611071638102),
 ('2019', 2.1117857649667533),
 ('responsible', 2.051052907200627),
 ('old', 2.0188930732512738),
 ('last', 2.0027726320280808),
 ('18', 1.9833726264391651),
 ('jurisdiction', 1.9341084497715753),
 ('updated', 1.9028694297028423),
 ('16', 1.8956420888727294),
 ('older', 1.838416287252544),
 ('california', 1.8285714285714283),
 ('children', 1.82056420302608)]

In [1070]:
Counter(afalsehonor).most_common(15)

[('and', 2.701013004903969),
 ('or', 2.5074970275430384),
 ('beacons', 2.409233475252727),
 ('indemnify', 2.353391971359324),
 ('harmless', 2.3455876685050026),
 ('services', 2.297735728363006),
 ('cookies', 2.281080551985294),
 ('including', 2.28035047020096),
 ('costs', 2.244786134364092),
 ('technologies', 2.2193931289698874),
 ('damages', 2.216661358042455),
 ('of', 2.20620205407551),
 ('claims', 2.1991228900700497),
 ('expenses', 2.1954018974274896),
 ('party', 2.1865924439616062)]

In [1071]:
X_test_tfiddy[X_test_tfiddy != 0]

array([0.18063766, 0.09450074, 0.30470142, ..., 0.10489251, 0.14307376,
       0.10509293])

In [1072]:
#first shitty model!

from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB(alpha=.01)
model.fit(X_train_tfiddy, y_train)
y_hat = model.predict(X_test_tfiddy)

thisisdumb = y_test + y_hat 
(thisisdumb == 'badgood').sum()
(thisisdumb == 'goodbad').sum()
model.score(X_test_tfiddy, y_test)

0.7931034482758621

In [1073]:
#count vectorizer instead of TFIDF

from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB(alpha=.01)
model.fit(X_train_voncount, y_train)
y_hat = model.predict(X_test_voncount)

model.score(X_test_voncount, y_test)

0.7789968652037618

In [1080]:
#the complement to naive bayes
model = ComplementNB(alpha=0.5)
model.fit(X_train_tfiddy, y_train)
y_hat = model.predict(X_test_tfiddy)

model.score(X_test_tfiddy, y_test)

0.8134796238244514

In [1090]:
lostwoods = GradientBoostingClassifier(n_estimators=100)
lostwoods.fit(X_train_tfiddy,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [1086]:
losthats = lostwoods.predict(X_test_tfiddy)

In [1091]:
lostwoods.score(X_test_tfiddy,y_test)

0.7633228840125392

In [None]:
feature_log_prob

In [None]:
model = MultinomialNB(alpha=0.5)
model.fit(X_train_tfiddy, y_train)
y_hat = model.predict(X_test_tfiddy)

thisisdumb = y_test + y_hat 
(thisisdumb == 'badgood').sum()
(thisisdumb == 'goodbad').sum()
model.score(X_test_tfiddy, y_test)

In [1075]:
#multinomial classifier

tfiddy = TfidfVectorizer(stop_words=cleanest_companies)
tfiddy.fit(X_train)
X_train_tfiddy = tfiddy.transform(X_train).toarray()
X_test_tfiddy = tfiddy.transform(X_test).toarray()

model = MultinomialNB(alpha=0.01)
model.fit(X_train_tfiddy, y_train)
multiy_hat = model.predict(X_test_tfiddy)

model.score(X_test_tfiddy, y_test)
#model.predict_proba(X_test_tfiddy)
#model.classes_

0.7931034482758621

In [1076]:
#multinomial countvectorizer

voncount = CountVectorizer(stop_words=cleanest_companies)
voncount.fit(MX_train)
MX_train_voncount = voncount.transform(MX_train).toarray()
MX_test_voncount = voncount.transform(MX_test).toarray()

model = MultinomialNB(alpha=0.01)
model.fit(MX_train_voncount, My_train)
multiy_hat = model.predict(MX_test_voncount)

model.score(MX_test_voncount, My_test)

0.7640625