In [1]:
import pandas as pd
from pandasql import sqldf
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import cross_val_score
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
pd.set_option('display.max_colwidth', 70)
pd.set_option('display.max_rows', None)

In [3]:
pysqldf = lambda q: sqldf(q, globals())

In [4]:
#nltk.download('stopwords')
#nltk.download('wordnet')

In [5]:
# setting a couple of workbook variables
text_source = "transcript"
tag = "news report"

In [6]:
stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [7]:
df_human = pd.read_csv("data/classifications.csv")
df_ml = pd.read_csv("data/cleaned_data.csv")

In [8]:
df_ml.head()

Unnamed: 0,ID,title,runtime,transcript
0,ffxh0257,"Deposition of Jeff Kilper, Senior Director for Finance for Special...",1:07:17,record okay took lunch break mrs dilber talking exhibit number pag...
1,fghb0039,Camel Cigarettes informercial,00:20:51,getting pretty late hit trail snoop around living like ain t job k...
2,fgxh0257,"Deposition of Mark Trudeau, President and CEO",0:52:40,back record questions witness concludes mr trudeau composition tha...
3,fhfk0146,CNN Crossfire debate on smoking in public,0:26:20,tonight washington crossfire saturday smoking skies ifs ands butts...
4,fhgb0191,Think. Don't Smoke. ad campaign compilation,0:34:02,hey guys may ask couple questions ever tried cigarettes know reall...


In [9]:
df_human.head()

Unnamed: 0,ID,description,subject,title,runtime
0,ffhb0039,Dan Chenowetch is interviewed about why he smokes and where he has...,tobacco; cigarette; secondhand smoke; addiction; bans; flavors; br...,Interview with smoker Dan Chenowhich,0:20:41
1,ffxh0257,Deposition of Jeff Kilper in RE: National Prescription Opiate Liti...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Jeff Kilper, Senior Director for Finance for Special...",1:07:17
2,fghb0039,Camel Cigarettes infomercial on why their cigarettes are superior ...,tobacco; cigarette; marketing;,Camel Cigarettes informercial,00:20:51
3,fgxh0257,Deposition of Mark Trudeau in Re: State of Rhode Island v Purdue P...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Mark Trudeau, President and CEO",0:52:40
4,fhfk0146,CNN debate on the ban of smoking on Airplanes. Ban is put in place...,tobacco; cigarette;secondhand smoke; closed environment;,CNN Crossfire debate on smoking in public,0:26:20


In [10]:
# Todo? do we want to denormalize this, or get a list of distinct subjects

In [11]:
df_dataset = pysqldf(f"""
SELECT dh.*, dml.{text_source}
FROM 
    df_human dh
JOIN
    df_ml dml
ON dh.ID = dml.id
    WHERE description is not NULL
    AND description != 'error code 224003'
    AND subject NOT LIKE '%href%'
""")

In [12]:
len(df_dataset)

324

In [13]:
df_dataset.head(2)

Unnamed: 0,ID,description,subject,title,runtime,transcript
0,ffxh0257,Deposition of Jeff Kilper in RE: National Prescription Opiate Liti...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Jeff Kilper, Senior Director for Finance for Special...",1:07:17,record okay took lunch break mrs dilber talking exhibit number pag...
1,fghb0039,Camel Cigarettes infomercial on why their cigarettes are superior ...,tobacco; cigarette; marketing;,Camel Cigarettes informercial,00:20:51,getting pretty late hit trail snoop around living like ain t job k...


In [14]:
#df_dataset.set_index("ID", inplace=True)

In [15]:
# what is this doing and can i use python instead? i can't figure out how to
# selecting ID, subject, transcript
# adding category column when subject includes wanted tag, then 1, else 0

# df = pysqldf(f"""
# SELECT 
#     ID,
#     subject,  
#     {text_source},
#     CASE
#         WHEN subject LIKE '%lawsuit%' THEN 1
#         ELSE 0
#     END AS category
# FROM df_dataset
# --WHERE subject LIKE '%lawsuit%'
# """)

df = df_dataset[['ID', 'subject', 'transcript']]
category = []
for row in df['subject']:
    if tag in row:
        category.append(1)
    else:
        category.append(0)
df['category'] = category
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category'] = category


Unnamed: 0,ID,subject,transcript,category
0,ffxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,record okay took lunch break mrs dilber talking exhibit number pag...,0
1,fghb0039,tobacco; cigarette; marketing;,getting pretty late hit trail snoop around living like ain t job k...,0
2,fgxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,back record questions witness concludes mr trudeau composition tha...,0
3,fhfk0146,tobacco; cigarette;secondhand smoke; closed environment;,tonight washington crossfire saturday smoking skies ifs ands butts...,0
4,fhgb0191,tobacco; cigarette; youth access; PSA; children; future;,hey guys may ask couple questions ever tried cigarettes know reall...,0
5,fhhb0039,tobacco; cigarette; advertise; appeal;,hello doug sarah mcclurkin think time program talked know one ciga...,0
6,fhhh0100,tobacco; cigarette; break; relief;,ever heard man literally walk wall human fly well looking hundred ...,0
7,fhhj0223,tobacco; cigarette; youth; regulation;,top story tonight indiana retail associations joining forces keep ...,0
8,fhwm0256,opioids; coupon program; efficacy; sales and marketing;,tell remember detail happened iraq benefits drug used okay force r...,0
9,fjhb0039,tobacco; cigarette;quality; advertising;,welcome winston bull comedy break guys gonna right watching gerald...,0


In [16]:
df.set_index("ID", inplace=True)

In [17]:
df.head()

Unnamed: 0_level_0,subject,transcript,category
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ffxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,record okay took lunch break mrs dilber talking exhibit number pag...,0
fghb0039,tobacco; cigarette; marketing;,getting pretty late hit trail snoop around living like ain t job k...,0
fgxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,back record questions witness concludes mr trudeau composition tha...,0
fhfk0146,tobacco; cigarette;secondhand smoke; closed environment;,tonight washington crossfire saturday smoking skies ifs ands butts...,0
fhgb0191,tobacco; cigarette; youth access; PSA; children; future;,hey guys may ask couple questions ever tried cigarettes know reall...,0


In [18]:
# number of videos that have chosen tag
(df.category == 1).sum()

15

In [19]:
df_train, df_test = train_test_split(df, test_size=0.5)

In [20]:
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words=stop)),
    ('clf', RandomForestClassifier())
    ])

In [21]:
X_train = text_clf['vect'].fit_transform(df_train[text_source])

In [22]:
text_clf['vect'].get_feature_names_out()

array(['aaron', 'abandon', 'abandoned', ..., 'zoom', 'zooming', 'zucker'],
      dtype=object)

In [23]:
#X_train.todense()

In [24]:
df_vector = pd.DataFrame(X_train.todense())
df_vector.columns = text_clf['vect'].get_feature_names_out()

In [25]:
pd.set_option('display.max_rows', None)
#df_vector.T

In [26]:
#df_train.iloc[0][text_source]

In [27]:
X_train = text_clf.fit(df_train[text_source], df_train['category'])

In [28]:
feature_df = pd.DataFrame({"feature": text_clf['vect'].get_feature_names_out(), 
                           'importance': text_clf['clf'].feature_importances_})

In [29]:
pysqldf("""
SELECT 
    *
FROM
    feature_df
ORDER BY
    importance DESC
""").head(10)

Unnamed: 0,feature,importance
0,shorten,0.028222
1,residents,0.017295
2,heads,0.01441
3,diseases,0.014212
4,lighter,0.013963
5,longevity,0.013905
6,pregnant,0.013143
7,longer,0.013028
8,physiologist,0.012728
9,costs,0.012506


In [30]:
feature_df.sort_values(by=['importance'], ascending=False)

Unnamed: 0,feature,importance
10227,shorten,0.028222
9470,residents,0.017295
5275,heads,0.01441
3299,diseases,0.014212
6573,lighter,0.013963
6708,longevity,0.013905
8633,pregnant,0.013143
6706,longer,0.013028
8319,physiologist,0.012728
2598,costs,0.012506


In [31]:
cross_val = cross_val_score(text_clf, df_test[text_source], df_test['category'], cv=4)

In [32]:
cross_val

array([0.92682927, 0.92682927, 0.95      , 0.925     ])

In [33]:
y_proba = text_clf.predict_proba(df_test[text_source])
y_pred = text_clf.predict(df_test[text_source])

In [34]:
y_proba

array([[0.96, 0.04],
       [1.  , 0.  ],
       [0.98, 0.02],
       [0.96, 0.04],
       [0.96, 0.04],
       [0.98, 0.02],
       [0.94, 0.06],
       [0.98, 0.02],
       [1.  , 0.  ],
       [0.95, 0.05],
       [1.  , 0.  ],
       [0.97, 0.03],
       [0.97, 0.03],
       [0.9 , 0.1 ],
       [0.9 , 0.1 ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.98, 0.02],
       [1.  , 0.  ],
       [0.97, 0.03],
       [0.98, 0.02],
       [0.96, 0.04],
       [0.98, 0.02],
       [0.99, 0.01],
       [1.  , 0.  ],
       [0.97, 0.03],
       [1.  , 0.  ],
       [0.98, 0.02],
       [0.97, 0.03],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.98, 0.02],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.96, 0.04],
       [0.98, 0.02],
       [0.97, 0.03],
       [1.  , 0.  ],
       [0.98, 0.02],
       [0.97, 0.03],
       [0.95, 0.05],
       [0.99, 0.01],
       [0.98, 0.02],
       [1.  , 0.  ],
       [0.99, 0.01],
       [0.98, 0.02],
       [1.  , 0.  ],
       [1.  ,

In [35]:
text_clf.classes_

array([0, 1])

In [36]:
prob_no_match = [p[0] for p in y_proba]
prob_match = [p[1] for p in y_proba]

In [37]:
df_test['prob_no_match'] = prob_no_match
df_test['prob_match'] = prob_match
df_test['y_pred'] = y_pred

In [38]:
# rename y_match?
y_adj = []

for p in prob_match:
    if p <= .5:
        y_adj.append("Not")
    else:
        y_adj.append("Match")
    
df_test['y_adj'] = y_adj

In [39]:
df_test[['category', 'y_pred', 'y_adj', 'prob_no_match', 'prob_match']]

Unnamed: 0_level_0,category,y_pred,y_adj,prob_no_match,prob_match
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
flhb0039,0,0,Not,0.96,0.04
tmgp0190,0,0,Not,1.0,0.0
zknh0063,0,0,Not,0.98,0.02
hzhh0257,0,0,Not,0.96,0.04
ghxb0079,0,0,Not,0.96,0.04
zngl0191,0,0,Not,0.98,0.02
zrfp0190,1,0,Not,0.94,0.06
fsfp0190,0,0,Not,0.98,0.02
jrhb0039,0,0,Not,1.0,0.0
hzgb0039,0,0,Not,0.95,0.05


In [40]:
pysqldf("""SELECT * FROM df_test WHERE `y_pred` != category""")

Unnamed: 0,ID,subject,transcript,category,prob_no_match,prob_match,y_pred,y_adj
0,zrfp0190,tobacco; cigarette; news report; science,surgeon general says cigarette smoking nation s chief cause preven...,1,0.94,0.06,0,Not
1,xrfp0190,tobacco; cigarette; news report; study,new medical study shows smoking low tar cigarettes less harmful sm...,1,0.97,0.03,0,Not
2,zpwc0072,tobacco; cigarette; news report;,cnbc shareholder plenty muscle promises keep pressure rjr nabisco ...,1,0.97,0.03,0,Not
3,xmhb0039,tobacco; cigarette; news report; new product; interview,minutes hour chattanooga tennessee new kind cigarette market manu...,1,0.99,0.01,0,Not
4,zlpj0015,tobacco; cigarette; news report; new product,big lips burn tobacco heat riddle says hot carbon tip sins warm ai...,1,1.0,0.0,0,Not
5,xpgl0191,tobacco; cigarette; news report,okay drama news latest government report cigarettes show manufactu...,1,0.95,0.05,0,Not
6,yyhb0039,tobacco; cigarette; news report; advertising; demographics; sales,know war words waged cigarettes continuing series titled smoking g...,1,0.97,0.03,0,Not
7,zpgp0190,tobacco; cigarette; news report; companies,growing evidence product killing people eventually led fan broadca...,1,0.97,0.03,0,Not
8,yqgl0191,tobacco; cigarette; news report; government,research side saying time smokers suspected years smoking addictiv...,1,0.99,0.01,0,Not
9,zjhb0039,tobacco; cigarette; news report; sales,well several months ago moneyline first report rj reynolds develop...,1,1.0,0.0,0,Not


In [41]:
pysqldf(f"""
SELECT df_test.*, df.{text_source}, df.category
FROM df_test
JOIN df
ON df_test.ID = df.ID
""")

Unnamed: 0,ID,subject,transcript,category,prob_no_match,prob_match,y_pred,y_adj,transcript.1,category.1
0,flhb0039,tobacco; cigarette; filter; taste;,science foundation technology technology nothing less applied scie...,0,0.96,0.04,0,Not,science foundation technology technology nothing less applied scie...,0
1,tmgp0190,tobacco; cigarette; promotion; advertising,well first half good neighbors brought by remove three hundred hea...,0,1.0,0.0,0,Not,well first half good neighbors brought by remove three hundred hea...,0
2,zknh0063,tobacco; cigarette; sales; marketing;,thank jane good morning would like use last minutes prior lunch re...,0,0.98,0.02,0,Not,thank jane good morning would like use last minutes prior lunch re...,0
3,hzhh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,back record good afternoon mrs collier name tricia spell i m attor...,0,0.96,0.04,0,Not,back record good afternoon mrs collier name tricia spell i m attor...,0
4,ghxb0079,tobacco; cigarette; secondhand smoke;,morning would like begin process believe long overdue would like b...,0,0.96,0.04,0,Not,morning would like begin process believe long overdue would like b...,0
5,zngl0191,tobacco; cigarette; news; interview,suppose tremendous amount research going including tobacco industr...,0,0.98,0.02,0,Not,suppose tremendous amount research going including tobacco industr...,0
6,zrfp0190,tobacco; cigarette; news report; science,surgeon general says cigarette smoking nation s chief cause preven...,1,0.94,0.06,0,Not,surgeon general says cigarette smoking nation s chief cause preven...,1
7,fsfp0190,tobacco; cigarette; nicotine; addiction; health; unhealthy;,new accusations tobacco companies misleading public dangers smokin...,0,0.98,0.02,0,Not,new accusations tobacco companies misleading public dangers smokin...,0
8,jrhb0039,tobacco; cigarette; marketing; flavor;,name carol little bit something tail emotion product really really...,0,1.0,0.0,0,Not,name carol little bit something tail emotion product really really...,0
9,hzgb0039,tobacco; cigarette; tax; profit; regulation;,actual cohort study hariyama cohort study one study least show ide...,0,0.95,0.05,0,Not,actual cohort study hariyama cohort study one study least show ide...,0


In [42]:
y_pred = text_clf.predict(df_test[text_source])

In [43]:
# everything below this line will vary based on the run

In [44]:
precision, recall, fscore, train_support = precision_recall_fscore_support(df_test['category'], y_pred, pos_label=1, average='binary')
print('Precision: {} / Recall: {} / Accuracy: {} / fscore: {}'.format(
    round(precision, 3), round(recall, 3), (round((y_pred==df_test['category']).sum()/len(y_adj), 3)), round(fscore, 3)))

Precision: 0.0 / Recall: 0.0 / Accuracy: 0.932 / fscore: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [45]:
print('f-score', 2 * ((precision * recall) / (precision + recall)))

f-score nan


  print('f-score', 2 * ((precision * recall) / (precision + recall)))


In [46]:
# note: is 15 matching videos not enough? probably not bc only about 5% of df