In [1]:
import pandas as pd
from pandasql import sqldf
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import cross_val_score
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
pd.set_option('display.max_colwidth', 70)
pd.set_option('display.max_rows', None)

In [3]:
pysqldf = lambda q: sqldf(q, globals())

In [4]:
#nltk.download('stopwords')
#nltk.download('wordnet')

In [5]:
# setting a couple of workbook variables
text_source = "transcript"
tag = "tobacco"

In [6]:
stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [7]:
df_human = pd.read_csv("data/classifications.csv")
df_ml = pd.read_csv("data/cleaned_data.csv")

In [8]:
df_ml.head()

Unnamed: 0,ID,title,runtime,transcript
0,ffxh0257,"Deposition of Jeff Kilper, Senior Director for Finance for Special...",1:07:17,record okay took lunch break mrs dilber talking exhibit number pag...
1,fghb0039,Camel Cigarettes informercial,00:20:51,getting pretty late hit trail snoop around living like ain t job k...
2,fgxh0257,"Deposition of Mark Trudeau, President and CEO",0:52:40,back record questions witness concludes mr trudeau composition tha...
3,fhfk0146,CNN Crossfire debate on smoking in public,0:26:20,tonight washington crossfire saturday smoking skies ifs ands butts...
4,fhgb0191,Think. Don't Smoke. ad campaign compilation,0:34:02,hey guys may ask couple questions ever tried cigarettes know reall...


In [9]:
df_human.head()

Unnamed: 0,ID,description,subject,title,runtime
0,ffhb0039,Dan Chenowetch is interviewed about why he smokes and where he has...,tobacco; cigarette; secondhand smoke; addiction; bans; flavors; br...,Interview with smoker Dan Chenowhich,0:20:41
1,ffxh0257,Deposition of Jeff Kilper in RE: National Prescription Opiate Liti...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Jeff Kilper, Senior Director for Finance for Special...",1:07:17
2,fghb0039,Camel Cigarettes infomercial on why their cigarettes are superior ...,tobacco; cigarette; marketing;,Camel Cigarettes informercial,00:20:51
3,fgxh0257,Deposition of Mark Trudeau in Re: State of Rhode Island v Purdue P...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Mark Trudeau, President and CEO",0:52:40
4,fhfk0146,CNN debate on the ban of smoking on Airplanes. Ban is put in place...,tobacco; cigarette;secondhand smoke; closed environment;,CNN Crossfire debate on smoking in public,0:26:20


In [10]:
# Todo? do we want to denormalize this, or get a list of distinct subjects

In [11]:
df_dataset = pysqldf(f"""
SELECT dh.*, dml.{text_source}
FROM 
    df_human dh
JOIN
    df_ml dml
ON dh.ID = dml.id
    WHERE description is not NULL
    AND description != 'error code 224003'
    AND subject NOT LIKE '%href%'
""")

In [12]:
len(df_dataset)

324

In [13]:
df_dataset.head(2)

Unnamed: 0,ID,description,subject,title,runtime,transcript
0,ffxh0257,Deposition of Jeff Kilper in RE: National Prescription Opiate Liti...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Jeff Kilper, Senior Director for Finance for Special...",1:07:17,record okay took lunch break mrs dilber talking exhibit number pag...
1,fghb0039,Camel Cigarettes infomercial on why their cigarettes are superior ...,tobacco; cigarette; marketing;,Camel Cigarettes informercial,00:20:51,getting pretty late hit trail snoop around living like ain t job k...


In [14]:
#df_dataset.set_index("ID", inplace=True)

In [15]:
# what is this doing and can i use python instead? i can't figure out how to
# selecting ID, subject, transcript
# adding category column when subject includes wanted tag, then 1, else 0

# df = pysqldf(f"""
# SELECT 
#     ID,
#     subject,  
#     {text_source},
#     CASE
#         WHEN subject LIKE '%lawsuit%' THEN 1
#         ELSE 0
#     END AS category
# FROM df_dataset
# --WHERE subject LIKE '%lawsuit%'
# """)

df = df_dataset[['ID', 'subject', 'transcript']]
category = []
for row in df['subject']:
    if tag in row:
        category.append(1)
    else:
        category.append(0)
df['category'] = category
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category'] = category


Unnamed: 0,ID,subject,transcript,category
0,ffxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,record okay took lunch break mrs dilber talking exhibit number pag...,0
1,fghb0039,tobacco; cigarette; marketing;,getting pretty late hit trail snoop around living like ain t job k...,1
2,fgxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,back record questions witness concludes mr trudeau composition tha...,0
3,fhfk0146,tobacco; cigarette;secondhand smoke; closed environment;,tonight washington crossfire saturday smoking skies ifs ands butts...,1
4,fhgb0191,tobacco; cigarette; youth access; PSA; children; future;,hey guys may ask couple questions ever tried cigarettes know reall...,1
5,fhhb0039,tobacco; cigarette; advertise; appeal;,hello doug sarah mcclurkin think time program talked know one ciga...,1
6,fhhh0100,tobacco; cigarette; break; relief;,ever heard man literally walk wall human fly well looking hundred ...,1
7,fhhj0223,tobacco; cigarette; youth; regulation;,top story tonight indiana retail associations joining forces keep ...,1
8,fhwm0256,opioids; coupon program; efficacy; sales and marketing;,tell remember detail happened iraq benefits drug used okay force r...,0
9,fjhb0039,tobacco; cigarette;quality; advertising;,welcome winston bull comedy break guys gonna right watching gerald...,1


In [16]:
df.set_index("ID", inplace=True)

In [17]:
df.head()

Unnamed: 0_level_0,subject,transcript,category
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ffxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,record okay took lunch break mrs dilber talking exhibit number pag...,0
fghb0039,tobacco; cigarette; marketing;,getting pretty late hit trail snoop around living like ain t job k...,1
fgxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,back record questions witness concludes mr trudeau composition tha...,0
fhfk0146,tobacco; cigarette;secondhand smoke; closed environment;,tonight washington crossfire saturday smoking skies ifs ands butts...,1
fhgb0191,tobacco; cigarette; youth access; PSA; children; future;,hey guys may ask couple questions ever tried cigarettes know reall...,1


In [18]:
# number of videos that have chosen tag
(df.category == 1).sum()

292

In [19]:
df_train, df_test = train_test_split(df, test_size=0.5)

In [20]:
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words=stop)),
    ('clf', RandomForestClassifier())
    ])

In [21]:
X_train = text_clf['vect'].fit_transform(df_train[text_source])

In [22]:
text_clf['vect'].get_feature_names_out()

array(['aaron', 'abandoned', 'abbot', ..., 'zones', 'zoo', 'zoom'],
      dtype=object)

In [23]:
#X_train.todense()

In [24]:
df_vector = pd.DataFrame(X_train.todense())
df_vector.columns = text_clf['vect'].get_feature_names_out()

In [25]:
pd.set_option('display.max_rows', None)
#df_vector.T

In [26]:
#df_train.iloc[0][text_source]

In [27]:
X_train = text_clf.fit(df_train[text_source], df_train['category'])

In [28]:
feature_df = pd.DataFrame({"feature": text_clf['vect'].get_feature_names_out(), 
                           'importance': text_clf['clf'].feature_importances_})

In [29]:
pysqldf("""
SELECT 
    *
FROM
    feature_df
ORDER BY
    importance DESC
""").head(10)

Unnamed: 0,feature,importance
0,mallinckrodt,0.015265
1,distributor,0.010646
2,visit,0.010528
3,wholesaler,0.010299
4,monitoring,0.010148
5,specifically,0.009931
6,opioid,0.009654
7,prescribed,0.008348
8,territory,0.008331
9,specific,0.008261


In [30]:
feature_df.sort_values(by=['importance'], ascending=False)

Unnamed: 0,feature,importance
6734,mallinckrodt,0.01526518
3347,distributor,0.01064573
12013,visit,0.01052817
12225,wholesaler,0.01029948
7185,monitoring,0.01014782
10407,specifically,0.009930987
7672,opioid,0.009654206
8506,prescribed,0.008347886
11187,territory,0.008330623
10406,specific,0.00826095


In [31]:
cross_val = cross_val_score(text_clf, df_test[text_source], df_test['category'], cv=4)

In [32]:
cross_val

array([0.95121951, 0.92682927, 0.9       , 0.95      ])

In [33]:
y_proba = text_clf.predict_proba(df_test[text_source])
y_pred = text_clf.predict(df_test[text_source])

In [34]:
y_proba

array([[0.01, 0.99],
       [0.06, 0.94],
       [0.05, 0.95],
       [0.  , 1.  ],
       [0.02, 0.98],
       [0.16, 0.84],
       [0.01, 0.99],
       [0.07, 0.93],
       [0.  , 1.  ],
       [0.03, 0.97],
       [0.18, 0.82],
       [0.  , 1.  ],
       [0.04, 0.96],
       [0.03, 0.97],
       [0.02, 0.98],
       [0.07, 0.93],
       [0.07, 0.93],
       [0.7 , 0.3 ],
       [0.04, 0.96],
       [0.17, 0.83],
       [0.06, 0.94],
       [0.06, 0.94],
       [0.01, 0.99],
       [0.02, 0.98],
       [0.16, 0.84],
       [0.06, 0.94],
       [0.02, 0.98],
       [0.53, 0.47],
       [0.13, 0.87],
       [0.05, 0.95],
       [0.09, 0.91],
       [0.11, 0.89],
       [0.06, 0.94],
       [0.04, 0.96],
       [0.02, 0.98],
       [0.4 , 0.6 ],
       [0.16, 0.84],
       [0.  , 1.  ],
       [0.76, 0.24],
       [0.08, 0.92],
       [0.04, 0.96],
       [0.  , 1.  ],
       [0.27, 0.73],
       [0.  , 1.  ],
       [0.06, 0.94],
       [0.  , 1.  ],
       [0.04, 0.96],
       [0.17,

In [35]:
text_clf.classes_

array([0, 1])

In [36]:
prob_no_match = [p[0] for p in y_proba]
prob_match = [p[1] for p in y_proba]

In [37]:
df_test['prob_no_match'] = prob_no_match
df_test['prob_match'] = prob_match
df_test['y_pred'] = y_pred

In [38]:
# rename y_match?
y_adj = []

for p in prob_match:
    if p <= .5:
        y_adj.append("Not")
    else:
        y_adj.append("Match")
    
df_test['y_adj'] = y_adj

In [39]:
df_test[['category', 'y_pred', 'y_adj', 'prob_no_match', 'prob_match']]

Unnamed: 0_level_0,category,y_pred,y_adj,prob_no_match,prob_match
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
zkhb0039,1,1,Match,0.01,0.99
yrhb0039,1,1,Match,0.06,0.94
xxxn0085,1,1,Match,0.05,0.95
ypgp0190,1,1,Match,0.0,1.0
zjhv0183,1,1,Match,0.02,0.98
tywh0182,1,1,Match,0.16,0.84
jrcf0191,1,1,Match,0.01,0.99
xsbg0182,1,1,Match,0.07,0.93
jpgl0191,1,1,Match,0.0,1.0
jfhb0039,1,1,Match,0.03,0.97


In [40]:
pysqldf("""SELECT * FROM df_test WHERE `y_pred` != category""")

Unnamed: 0,ID,subject,transcript,category,prob_no_match,prob_match,y_pred,y_adj
0,jhwm0256,opioids; physician interview; constipation; sales and marketing,famouser constipation big thing ovulates one smart half oxycodone ...,0,0.18,0.82,1,Match
1,zgwm0256,opioids; physician interview; sales and marketing,okay quick look saw yesterday well moa think see i m assuming neph...,0,0.09,0.91,1,Match
2,jzhh0257,lawsuit; legal activity; litigation; opioids; sales; marketing,media number three deposition tithing time monitor p m back record...,0,0.4,0.6,1,Match
3,xrgp0190,alcohol; commerical; advertisement; promotion,vinyl horse year s hits team thunder thunder maybe next year welco...,0,0.0,1.0,1,Match
4,tqph0257,opioids; sales and marketing; fentora; pharmaceutical industry,news reports coming across country effervescence peed indications ...,0,0.17,0.83,1,Match
5,xzhh0257,lawsuit; legal activity; litigation; opioids; sales; marketing,want show i m going marked exhibit number is page numbers wrong ye...,0,0.23,0.77,1,Match
6,yfxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing,back record p m mr ross recall testimony earlier today regarding m...,0,0.25,0.75,1,Match
7,ghwm0256,opioids; interview; physician; sales and marketing,let s address cost issue couple moments talk yesterday want ask co...,0,0.11,0.89,1,Match
8,kfwm0256,opioids; sales and marketing; focus group,reported numbers program take somebody important want work unders...,0,0.0,1.0,1,Match
9,xhwm0256,opioids; sales and marketing; physician interview; costs,make comment rep saying kind addressed upfront gave little preview...,0,0.12,0.88,1,Match


In [41]:
pysqldf(f"""
SELECT df_test.*, df.{text_source}, df.category
FROM df_test
JOIN df
ON df_test.ID = df.ID
""")

Unnamed: 0,ID,subject,transcript,category,prob_no_match,prob_match,y_pred,y_adj,transcript.1,category.1
0,zkhb0039,tobacco; cigarette; commerical; marketing; new product,adorable hi i m barbara husband bill thinks i m crazy putting tell...,1,0.01,0.99,1,Match,adorable hi i m barbara husband bill thinks i m crazy putting tell...,1
1,yrhb0039,tobacco; cigarette; training; sales,welcome retail conversion sales team objective sales team tell adu...,1,0.06,0.94,1,Match,welcome retail conversion sales team objective sales team tell adu...,1
2,xxxn0085,tobacco; cigarette; African American; advertising; bar; public pla...,benson hedges th live rocket make funky start biggest names rhythm...,1,0.05,0.95,1,Match,benson hedges th live rocket make funky start biggest names rhythm...,1
3,ypgp0190,tobacco; cigarette,word alternate sponsor parliament cigarettes man knows value extra...,1,0.0,1.0,1,Match,word alternate sponsor parliament cigarettes man knows value extra...,1
4,zjhv0183,tobacco; cigarette,big may ask couple questions think tried people it always think i ...,1,0.02,0.98,1,Match,big may ask couple questions think tried people it always think i ...,1
5,tywh0182,tobacco; cigarette,every single program confrontation confrontation dental medical ex...,1,0.16,0.84,1,Match,every single program confrontation confrontation dental medical ex...,1
6,jrcf0191,tobacco; cigarette; fresh;,stepped long limousine said sir arthur smith beverly enjoy i m smo...,1,0.01,0.99,1,Match,stepped long limousine said sir arthur smith beverly enjoy i m smo...,1
7,xsbg0182,tobacco; cigarette; sales; marketing; employee video; training,prepared time psychic hotline mysterious questions future answered...,1,0.07,0.93,1,Match,prepared time psychic hotline mysterious questions future answered...,1
8,jpgl0191,tobacco; cigarette; nicotine; addiction; regulation;,disappointed little information available big tobacco companies sa...,1,0.0,1.0,1,Match,disappointed little information available big tobacco companies sa...,1
9,jfhb0039,tobacco; cigarette; taste; smokeless; ash,always implant barbara creative crushed consider well cigarette bu...,1,0.03,0.97,1,Match,always implant barbara creative crushed consider well cigarette bu...,1


In [42]:
y_pred = text_clf.predict(df_test[text_source])

In [43]:
# everything below this line will vary based on the run

In [44]:
precision, recall, fscore, train_support = precision_recall_fscore_support(df_test['category'], y_pred, pos_label=1, average='binary')
print('Precision: {} / Recall: {} / Accuracy: {} / fscore: {}'.format(
    round(precision, 3), round(recall, 3), (round((y_pred==df_test['category']).sum()/len(y_adj), 3)), round(fscore, 3)))

Precision: 0.936 / Recall: 1.0 / Accuracy: 0.938 / fscore: 0.967


In [45]:
print('f-score', 2 * ((precision * recall) / (precision + recall)))

f-score 0.9668874172185431
