In [1]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn import svm 
from sklearn import pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
ps = PorterStemmer()
stopWords = set(stopwords.words('english'))

In [16]:
caught_ads = []
with open('caught_environment_ads.csv', 'r') as caught_file:
    lines = caught_file.readlines()
    for line in lines:
        if line[0] != '"':
            continue
        word_list = [ps.stem(token) for token in word_tokenize(line[:-1])]
        word_list = [x for x in word_list if x not in stopWords]
        if len(word_list) > 1:
            caught_ads.append(' '.join(word_list))

In [18]:
df = pd.DataFrame(caught_ads, columns=['text'])
df['was_caught'] = True

In [19]:
uncaught_ads = []
with open('undisclosed_environment_ads.csv', 'r') as uncaught_file:
    lines = uncaught_file.readlines()
    for line in lines:
        if line[0] != '"':
            continue
        if line not in caught_ads:
            word_list = [ps.stem(token) for token in word_tokenize(line[:-1])]
            word_list = [x for x in word_list if x not in stopWords]
            if len(word_list) > 1:
                uncaught_text = ' '.join(word_list)
                uncaught_ads.append(pd.Series([uncaught_text, False], index=df.columns))

In [20]:
df = df.append(uncaught_ads, ignore_index=True)

In [21]:
x_train = df
y_train = x_train["was_caught"]
classifier = svm.LinearSVC(C=1.0, class_weight="balanced")
tf_idf = pipeline.Pipeline([
     ('tfidf', TfidfVectorizer()),
     ("classifier", classifier)
 ])
tf_idf.fit(x_train["text"], y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('classifier', LinearSVC(class_weight='balanced'))])

In [22]:
coefs = tf_idf.named_steps["classifier"].coef_
feature_names = tf_idf.named_steps["tfidf"].get_feature_names()
coefs_and_features = list(zip(coefs[0], feature_names))

In [23]:
# Most positive features
sorted(coefs_and_features, key=lambda x: x[0], reverse=True)

[(1.5982000075917797, 'pollut'),
 (1.3478255649109852, 'ocean'),
 (1.2645736082831918, 'plastic'),
 (1.1780689892963971, 'instal'),
 (1.1268648043842842, 'address'),
 (1.0790088519500136, 'issu'),
 (1.066668723861145, 'support'),
 (1.0419357321930038, 'coalit'),
 (1.0266746678990435, 'latest'),
 (1.00130071220809, 'progress'),
 (0.9982610443696195, 'footprint'),
 (0.9849968623532567, 'respons'),
 (0.9476423986432901, 'experi'),
 (0.9445458368525831, 'opt'),
 (0.9420896727565052, 're'),
 (0.940908918430278, 'eco'),
 (0.9284604752238369, 'great'),
 (0.9099834440885616, 'smog'),
 (0.878869915446395, 'sit'),
 (0.8764743931267616, 'fiction'),
 (0.8676906750746349, 'bio'),
 (0.8647743640811778, 'earth'),
 (0.849808215308624, 'contribut'),
 (0.8308527852616742, 'proud'),
 (0.8296780748102748, 'around'),
 (0.8231766742006499, 'voic'),
 (0.8211476546793165, 'pleas'),
 (0.8063033061753266, 'lessen'),
 (0.8032635463106816, 'number'),
 (0.7977713601886409, 'emili'),
 (0.7951929139326217, 'resid'),

In [24]:
# Most negative features
sorted(coefs_and_features, key=lambda x: x[0])

[(-3.1514775066537792, 'see'),
 (-1.6466974372023324, 'help'),
 (-1.553716756297536, 'school'),
 (-1.5392514818450707, 'recycl'),
 (-1.523520824408771, 'biden'),
 (-1.511263423692173, 'decemb'),
 (-1.4697239224634104, 'night'),
 (-1.4691968098594712, 'suck'),
 (-1.4623388936268114, 'holiday'),
 (-1.4519886988515434, '2021'),
 (-1.4074720757266461, 'fenc'),
 (-1.3705760226326842, 'rv'),
 (-1.347617391168552, 'mateo'),
 (-1.316202545163029, 'altern'),
 (-1.2925243597869331, 'conflict'),
 (-1.2908970606426013, 'els'),
 (-1.2631783007890371, 'old'),
 (-1.2631659644385713, 'stori'),
 (-1.248925709681474, 'remov'),
 (-1.224777894216719, 'subscrib'),
 (-1.2215707635065336, '21'),
 (-1.2213867453558507, 'almond'),
 (-1.2195763796995251, 'easiest'),
 (-1.2157714381107723, 'solv'),
 (-1.1974229452228577, 'cover'),
 (-1.1901775158709593, 'factori'),
 (-1.1879371911988106, 'free'),
 (-1.1720649351730645, '2050'),
 (-1.1598632844216679, 'locat'),
 (-1.1570909123497386, 'filter'),
 (-1.1536289415174

In [25]:
# Most predictive overall
sorted(coefs_and_features, key=lambda x: abs(x[0]), reverse=True)

[(-3.1514775066537792, 'see'),
 (-1.6466974372023324, 'help'),
 (1.5982000075917797, 'pollut'),
 (-1.553716756297536, 'school'),
 (-1.5392514818450707, 'recycl'),
 (-1.523520824408771, 'biden'),
 (-1.511263423692173, 'decemb'),
 (-1.4697239224634104, 'night'),
 (-1.4691968098594712, 'suck'),
 (-1.4623388936268114, 'holiday'),
 (-1.4519886988515434, '2021'),
 (-1.4074720757266461, 'fenc'),
 (-1.3705760226326842, 'rv'),
 (1.3478255649109852, 'ocean'),
 (-1.347617391168552, 'mateo'),
 (-1.316202545163029, 'altern'),
 (-1.2925243597869331, 'conflict'),
 (-1.2908970606426013, 'els'),
 (1.2645736082831918, 'plastic'),
 (-1.2631783007890371, 'old'),
 (-1.2631659644385713, 'stori'),
 (-1.248925709681474, 'remov'),
 (-1.224777894216719, 'subscrib'),
 (-1.2215707635065336, '21'),
 (-1.2213867453558507, 'almond'),
 (-1.2195763796995251, 'easiest'),
 (-1.2157714381107723, 'solv'),
 (-1.1974229452228577, 'cover'),
 (-1.1901775158709593, 'factori'),
 (-1.1879371911988106, 'free'),
 (1.17806898929639

In [26]:
print(len(caught_ads))
print(len(uncaught_ads))

1543
499
