In [1]:
import pandas as pd
import numpy as np
import re
import time

from matplotlib import pyplot as plt
from simhash import Simhash, SimhashIndex

import warnings
warnings.filterwarnings("ignore")

In [2]:
#Import data labelled using Social Animal
df_original = pd.read_csv('data/input/dt_sa_merge_20200609.txt',sep ="|").dropna(subset=['C1'])
print("There are %s observations in the raw dataset" % len(df_original))

#Articles with only one C1 label
df = df_original[[ True if len( re.findall(r'C1',str(item)))==1 else False 
                  for item in df_original['C1'] ]].reset_index(drop=True)
df['C1'] = df['C1'].replace({'C1-Law / Policy Enforcement / Prevention':"C1-Merged",
                             'C1-Study / Report / Commentary':"C1-Merged"}) 
print("There are %s observations with 1 C1 label" % len(df))

#Articles with only one C2 label
df_2 = df_original[[ True if len( re.findall(r'C1',str(item)))==2 else False for item in df_original['C1'] ]]
print("There are %s observations with 2 C1 label" % len(df_2))

There are 2561 observations in the raw dataset
There are 2557 observations with 1 C1 label
There are 4 observations with 2 C1 label


##  Data Preparation

### Remove Mislabeled Articles
Articles with nearly duplicated contents with different C1 labels

In [3]:
def check_duplicates(index, objs):
    dups_ = []
    for i in range(len(objs)):
        dups = index.get_near_dups(objs[i][1])
        if len(dups) > 1:
            dups.sort()
            dups_.append(tuple(dups))
    return list(set(dups_))

def deep_unique(li):
    using_ = li.copy()
    for i in range(len(li)):
        cur_tup = li[i]
        l = len(cur_tup)
        for j in li[i+1:]:
            for k in cur_tup:
                if k in j and l < len(j):
                    if cur_tup in using_:
                        using_.remove(cur_tup)
                    break
    return using_

def check_label(i1, i2):
    return df.C1.iloc[i1] == df.C1.iloc[i2]

content = df.content.to_numpy()
objs = []
for k in range(len(content)):
    objs.append((str(k), Simhash(content[k])))
    
index = SimhashIndex(objs, k=3)
dups = check_duplicates(index, objs)
dups_unique = deep_unique(dups)

consistent = []
inconsistent = []

for dups in dups_unique:
    flag = True
    for j in range(len(dups)):
        if j + 1 < len(dups) and not check_label(int(dups[j]),int(dups[j+1])):
            flag = False
            break
    if flag:
        consistent.append(dups)
    else:
        inconsistent.append(dups)

print(f"There are {len(dups_unique)} duplicated list of articles.")
print(f"Out of them, {len(consistent)} list of articles has consistent labels while {len(inconsistent)} does not.")

from collections import Counter
flat_inconsistent = []
revise_labels = {}
for item in inconsistent:
    if len(item)==2:
        flat_inconsistent.extend(list(item))
    else:
        labels = [df.C1.iloc[int(i)] for i in item ]
        temp = Counter(labels).most_common()[0]
        if temp[1]/len(labels)>0.5:
            for i in item:
                revise_labels[i] = temp[0]
        else:
            flat_inconsistent.extend(list(item)) 
flat_inconsistent = [ item  for item in flat_inconsistent if int(item) not in revise_labels]
remove_inconsistent = [item  for item in df.index if str(item) not in flat_inconsistent]

df_prep1 = df.loc[remove_inconsistent,:]
print("%s inconsistent articles are removed"%len(flat_inconsistent))

flat_inconsistent = [ item  for item in flat_inconsistent if item not in revise_labels]
remove_inconsistent = [item  for item in df.index if str(item) not in flat_inconsistent]

df_prep1 = df.iloc[remove_inconsistent,:]
print("%s inconsistent articles are removed"%len(flat_inconsistent))

for key,item in revise_labels.items():
    df_prep1.loc[int(key)]['C1'] = item
print("After remove and relabel C1 category, there are %s articles" %len(df_prep1))

There are 141 duplicated list of articles.
Out of them, 95 list of articles has consistent labels while 46 does not.
69 inconsistent articles are removed
69 inconsistent articles are removed
After remove and relabel C1 category, there are 2490 articles


### Remove Irrelevant Cateogry Articles

In [4]:
df_prep2 =  df_prep1[[True if item in ['C1-Other', 'C1-Merged', 'C1-Trafficking Case / Story'] else False 
                      for item in df_prep1['C1']]].reset_index(drop = True)
print("After remove irrelevant Cateogry, there are %s articles" %len(df_prep2))

After remove irrelevant Cateogry, there are 2365 articles


### Update Relabeled Articles

In [5]:
df_prep2['C1-Merged'] = ['C1-Merged' ==item for item in df_prep2['C1'].values]
df_prep2['C1-Other'] = ['C1-Other'==item for item in df_prep2['C1'].values]
df_prep2['C1-Trafficking Case/Story'] = ['C1-Trafficking Case / Story'==item for item in df_prep2['C1'].values]

df_newtag = pd.read_json('data/input/sa_human_trafficking_20191217_20200315_20200709_dowjones retagging.json', lines=True)
df_newtag['url'] = [item.split("\n\n\n")[0][1:] for item in df_newtag['content']]

df_newtag['C1-Merged'] = [True if 'C1- Merged' in item['labels'] else False for item in df_newtag['annotation'].values]
df_newtag['C1-Other'] = [True if 'C1- Other' in item['labels'] else False for item in df_newtag['annotation'].values]
df_newtag['C1-Trafficking Case/Story'] = [True if 'C1- Trafficking Case/Story' in item['labels']  
                                          else False for item in df_newtag['annotation'].values]
temp1 = pd.merge(df_prep2.drop(['annotation','C1-Merged','C1-Other','C1-Trafficking Case/Story'],axis = 1),
                    df_newtag[['annotation', 'C1-Merged','C1-Other','C1-Trafficking Case/Story','url']],on='url') 
temp2 = df_prep2[[True if item not in temp1['url'].values else False for item in df_prep2['url']]]
df_prep3 = temp2.append(temp1)

### Remove wordpress content

In [6]:
df_prep4 = df_prep3[[False if type(item) == str and 'wordpress.com' in item else True for item in df_prep3.url.values ]]
print("After remove wordpress content, there are %s articles" %len(df_prep4))
print(df_prep4.sum()[['C1-Merged','C1-Other','C1-Trafficking Case/Story']])

After remove wordpress content, there are 2340 articles
C1-Merged                    1002
C1-Other                      471
C1-Trafficking Case/Story     905
dtype: object


## Preprocessing

In [7]:
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import unicodedata

In [8]:
def tokenize_lemmatize(doc):
    doc = re.sub("[\xa0]+","",doc)
    words = [v.lower() for v in word_tokenize(doc) if v.isalpha() or v.isdigit()]
    words = [w for w  in words if w not in stopWords] 
    words = [''.join((c for c in unicodedata.normalize('NFD', w) if unicodedata.category(c) != 'Mn')) for w in words]
    lemmatizer = WordNetLemmatizer() 
    words = [lemmatizer.lemmatize(w) for w in words]
    words = ["#number" if w.isdigit() else w for w in words]
    return words

In [9]:
content = df_prep4['content'].values
content = [tokenize_lemmatize(doc) for doc in content]

In [10]:
org_vacab  = Counter([v for item in content for v in item]).most_common() 
print("The orignal vacabuary size is %s" %len(org_vacab))
vocab_ = [item_value[0] for item_value in org_vacab if item_value[1]>5]
print("The refined vacabuary size is %s" %len(vocab_))
content = [" ".join([v for v in item if v in vocab_]) for item in content ]

The orignal vacabuary size is 33060
The refined vacabuary size is 10416


In [11]:
df_new = df_prep4.reset_index(drop=True)
df_new['content'] = content 

## Model Train & Test

TFIDF + Word Count

In [12]:
from sklearn import model_selection, preprocessing, linear_model, metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [13]:
def model_test(X_test, Y_test, svr):
    predictY = svr.predict(X_test)
    result = sum(predictY == Y_test)

    return float(result)/len(Y_test)

### Trafficking Case / Story

#### Train 

In [14]:
from scipy.sparse import csr_matrix, hstack
df_new['word_count'] = [len(item.split(" ")) for item in df_new['content']]
x_train, x_test, y_train, y_test = model_selection.train_test_split(df_new.content,df_new['C1-Trafficking Case/Story'],
                                                                    test_size = 0.2, random_state = 42) 
tfidf1 = TfidfVectorizer() 
tfidf1.fit(x_train)  
x_train= tfidf1.transform(x_train) 
x_test = tfidf1.transform(x_test) 
x_train =  hstack([x_train,np.array([df_new.iloc[y_train.index.values]['word_count']]).T])
x_test =  hstack([x_test,np.array([df_new.iloc[y_test.index.values]['word_count']]).T])
lr1 = linear_model.LogisticRegression(random_state=123,penalty='l2', solver='lbfgs')
lr1.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=123, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

#### Test - Social Animal Data

In [15]:
print(classification_report(y_test, lr1.predict(x_test)))
cm = pd.DataFrame(confusion_matrix(y_test, lr1.predict(x_test) ) )
cm.columns =['Predicted_0','Predicted_1'] 
cm.index =  ['True_0','True_1']
print(cm)
accuracy_test2 = model_test(x_test, y_test, lr1)
print("The model accuracy is %.2f" % accuracy_test2)

              precision    recall  f1-score   support

       False       0.87      0.96      0.91       279
        True       0.93      0.79      0.85       189

   micro avg       0.89      0.89      0.89       468
   macro avg       0.90      0.87      0.88       468
weighted avg       0.89      0.89      0.89       468

        Predicted_0  Predicted_1
True_0          267           12
True_1           40          149
The model accuracy is 0.89


#### Test - Dow Jones Historical Data

In [16]:
df_dj_hist = pd.read_json('data/input/dowjone_dt_sample_20200714.json', lines=True)
df_dj_hist['C1-Merged'] = [True if 'C1- Merged' in item['labels'] else False for item in df_dj_hist['annotation'].values]
df_dj_hist['C1-Other'] = [True if 'C1- Other' in item['labels'] else False for item in df_dj_hist['annotation'].values]
df_dj_hist['C1-Trafficking Case/Story'] = [True if 'C1- Trafficking Case/Story' in item['labels']  
                                          else False for item in df_dj_hist['annotation'].values]

content_test = df_dj_hist['content'].values
content_test = [tokenize_lemmatize(doc) for doc in content_test]
content_test = [" ".join([v for v in item if v in vocab_]) for item in content_test]
df_dj_hist['content'] = content_test
df_dj_hist['word_count'] = [len(item.split(" ")) for item in df_dj_hist['content']]
x_val = tfidf1.transform(content_test) 
y_val = df_dj_hist['C1-Trafficking Case/Story'].values

x_val =  hstack([x_val,np.array( [df_dj_hist['word_count']]).T])
print(classification_report(y_val, lr1.predict(x_val)))

cm = pd.DataFrame(confusion_matrix(y_val, lr1.predict(x_val) ) )
cm.columns =['Predicted_0','Predicted_1'] 
cm.index =  ['True_0','True_1']
print(cm)
accuracy_test2 = model_test(x_val, y_val, lr1)
print("The model accuracy is %.2f" % accuracy_test2)

              precision    recall  f1-score   support

       False       0.91      0.96      0.93        74
        True       0.86      0.73      0.79        26

   micro avg       0.90      0.90      0.90       100
   macro avg       0.89      0.85      0.86       100
weighted avg       0.90      0.90      0.90       100

        Predicted_0  Predicted_1
True_0           71            3
True_1            7           19
The model accuracy is 0.90


#### Test - Dow Jones Phase I data 

In [17]:
df_dj_p1 = pd.read_json('data/input/phase_1_dt_sample_20200720.json', lines=True)
df_dj_p1['C1-Merged'] = [True if 'C1- Merged' in item['labels'] else False for item in df_dj_p1['annotation'].values]
df_dj_p1['C1-Other'] = [True if 'C1- Other' in item['labels'] else False for item in df_dj_p1['annotation'].values]
df_dj_p1['C1-Trafficking Case/Story'] = [True if 'C1- Trafficking Case/Story' in item['labels']  
                                          else False for item in df_dj_p1['annotation'].values]

content_test = df_dj_p1['content'].values
content_test = [tokenize_lemmatize(doc) for doc in content_test]
content_test = [" ".join([v for v in item if v in vocab_]) for item in content_test]
df_dj_p1['content'] = content_test
df_dj_p1['word_count'] = [len(item.split(" ")) for item in df_dj_p1['content']]
x_val = tfidf1.transform(content_test) 
y_val = df_dj_p1['C1-Trafficking Case/Story'].values

x_val =  hstack([x_val,np.array( [df_dj_p1['word_count']]).T])
print(classification_report(y_val, lr1.predict(x_val)))

cm = pd.DataFrame(confusion_matrix(y_val, lr1.predict(x_val) ) )
cm.columns =['Predicted_0','Predicted_1'] 
cm.index =  ['True_0','True_1']
print(cm)
accuracy_test2 = model_test(x_val, y_val, lr1)
print("The model accuracy is %.2f" % accuracy_test2)

              precision    recall  f1-score   support

       False       0.89      0.93      0.91        61
        True       0.89      0.82      0.85        39

   micro avg       0.89      0.89      0.89       100
   macro avg       0.89      0.88      0.88       100
weighted avg       0.89      0.89      0.89       100

        Predicted_0  Predicted_1
True_0           57            4
True_1            7           32
The model accuracy is 0.89


### Law policy prevention & study report commentary
#### Train 

In [18]:
from scipy.sparse import csr_matrix, hstack
df_new['word_count'] = [len(item.split(" ")) for item in df_new['content']]
x_train, x_test, y_train, y_test = model_selection.train_test_split(df_new.content,df_new['C1-Merged'],
                                                                    test_size = 0.2, random_state = 42) 
tfidf2 = TfidfVectorizer() 
x_train = tfidf2.fit_transform(x_train)  
x_test = tfidf2.transform(x_test) 
x_train =  hstack([x_train,np.array([df_new.iloc[y_train.index.values]['word_count']]).T])
x_test =  hstack([x_test,np.array([df_new.iloc[y_test.index.values]['word_count']]).T])
lr2 = linear_model.LogisticRegression(random_state=123,penalty='l2', solver='lbfgs')
lr2.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=123, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

#### Test - Social Animal Data

In [19]:
print(classification_report(y_test, lr2.predict(x_test)))
cm = pd.DataFrame(confusion_matrix(y_test, lr2.predict(x_test) ) )
cm.columns =['Predicted_0','Predicted_1'] 
cm.index =  ['True_0','True_1']
print(cm)
accuracy_test2 = model_test(x_test, y_test, lr2)
print("The model accuracy is %.2f" % accuracy_test2)

              precision    recall  f1-score   support

       False       0.86      0.88      0.87       278
        True       0.81      0.78      0.80       190

   micro avg       0.84      0.84      0.84       468
   macro avg       0.84      0.83      0.83       468
weighted avg       0.84      0.84      0.84       468

        Predicted_0  Predicted_1
True_0          244           34
True_1           41          149
The model accuracy is 0.84


#### Test - Dow Jones Historical Data

In [20]:
df_dj_hist = pd.read_json('data/input/dowjone_dt_sample_20200714.json', lines=True)
df_dj_hist['C1-Merged'] = [True if 'C1- Merged' in item['labels'] else False for item in df_dj_hist['annotation'].values]
df_dj_hist['C1-Other'] = [True if 'C1- Other' in item['labels'] else False for item in df_dj_hist['annotation'].values]
df_dj_hist['C1-Trafficking Case/Story'] = [True if 'C1- Trafficking Case/Story' in item['labels']  
                                          else False for item in df_dj_hist['annotation'].values]
df_dj_hist['content'] = content_test
df_dj_hist['word_count'] = [len(item.split(" ")) for item in df_dj_hist['content']]

content_test = df_dj_hist['content'].values
content_test = [tokenize_lemmatize(doc) for doc in content_test]
content_test = [" ".join([v for v in item if v in vocab_]) for item in content_test]
x_val = tfidf2.transform(content_test) 
y_val = df_dj_hist['C1-Merged'].values

x_val =  hstack([x_val,np.array( [df_dj_hist['word_count']]).T])
print(classification_report(y_val, lr2.predict(x_val)))

cm = pd.DataFrame(confusion_matrix(y_val, lr2.predict(x_val) ) )
cm.columns =['Predicted_0','Predicted_1'] 
cm.index =  ['True_0','True_1']
print(cm)
accuracy_test2 = model_test(x_val, y_val, lr2)
print("\n The model accuracy is %.2f" % accuracy_test2)

              precision    recall  f1-score   support

       False       0.59      0.89      0.71        61
        True       0.22      0.05      0.08        39

   micro avg       0.56      0.56      0.56       100
   macro avg       0.41      0.47      0.40       100
weighted avg       0.45      0.56      0.47       100

        Predicted_0  Predicted_1
True_0           54            7
True_1           37            2

 The model accuracy is 0.56


#### Test - Dow Jones Phase I data 

In [21]:
df_dj_p1 = pd.read_json('data/input/phase_1_dt_sample_20200720.json', lines=True)
df_dj_p1['C1-Merged'] = [True if 'C1- Merged' in item['labels'] else False for item in df_dj_p1['annotation'].values]
df_dj_p1['C1-Other'] = [True if 'C1- Other' in item['labels'] else False for item in df_dj_p1['annotation'].values]
df_dj_p1['C1-Trafficking Case/Story'] = [True if 'C1- Trafficking Case/Story' in item['labels']  
                                          else False for item in df_dj_p1['annotation'].values]

content_test = df_dj_p1['content'].values
content_test = [tokenize_lemmatize(doc) for doc in content_test]
content_test = [" ".join([v for v in item if v in vocab_]) for item in content_test]
df_dj_p1['content'] = content_test
df_dj_p1['word_count'] = [len(item.split(" ")) for item in df_dj_p1['content']]
x_val = tfidf2.transform(content_test) 
y_val = df_dj_p1['C1-Merged'].values

x_val =  hstack([x_val,np.array( [df_dj_p1['word_count']]).T])
print(classification_report(y_val, lr2.predict(x_val)))

cm = pd.DataFrame(confusion_matrix(y_val, lr2.predict(x_val) ) )
cm.columns =['Predicted_0','Predicted_1'] 
cm.index =  ['True_0','True_1']
print(cm)
accuracy_test2 = model_test(x_val, y_val, lr2)
print("The model accuracy is %.2f" % accuracy_test2)

              precision    recall  f1-score   support

       False       0.62      0.97      0.75        58
        True       0.78      0.17      0.27        42

   micro avg       0.63      0.63      0.63       100
   macro avg       0.70      0.57      0.51       100
weighted avg       0.68      0.63      0.55       100

        Predicted_0  Predicted_1
True_0           56            2
True_1           35            7
The model accuracy is 0.63


## Save Model

In [22]:
import pickle

In [23]:
pickle.dump(lr1, open(f"model/lr1_trafficking_case_story.pkl","wb"))
pickle.dump(tfidf1, open(f"model/tfidf1_trafficking_case_story.pkl","wb"))
pickle.dump(lr2, open(f"model/lr2_trafficking_merged.pkl","wb"))
pickle.dump(tfidf2, open(f"model/tfidf2_trafficking_merged.pkl","wb"))