In [1]:
from wordcloud import STOPWORDS

from sklearn.feature_extraction.text import *
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics

from sklearn.naive_bayes import MultinomialNB

import pandas as pd

##My idea here was that we could build a model to predict which text came from which of the accused. We know that Kaing Guev Eav was the only person charged who admitted wrongdoing. In theory, a model could potentially learn to recognize the testimony of a person who takes responsibility versus those who deny wrongdoing, or at least learn what types of testimony may indicate that a person is avoiding or downplaying their part in atrocities.

##I think that this would likely not be easily accomplished with only this dataset. But the idea is reasonable? The transcripts are word-for-word records of the actual testimony (that is generally how court documents work). People tend to have different words and cadences that make the way they speak individual. The structure of their speech indicates the speaker. Based on this, the model should be able to pick up on the types of things said in the Kaing testimony that make it distinct from the Chea/Samphan testimony. Extending this to other trials, as long as there is an individual who is considered truthful, all other testimony could be compared against that.

##We also obviously run into the issue of translation. There may be subtleties that do not translate, particularly in languages which don't share the same root language. English and Khmer are certainly far removed in origin on the language tree, so there are definitely things that have been lost in translation. A lot of intent is tied into nuance in language. Tone is not conveyed at all in the transcript. The verbal testimony likely left listeners with a different impression than can ever be learned here.

##We would also probably run up against ethical issues. Letting an algorithm determine truth is a dangerous thing. But I envision this more as a tool to inform questioning rather than something that can be used as evidence itself.

##But we will try anyway? --Kyle

In [19]:
# Location of the GTC-V2 CSV in the genocide-transcript-corpus GitHub repository.
GTC_V2_URL = (
    'https://raw.githubusercontent.com/MiriamSchirmer/'
    'genocide-transcript-corpus/main/Dataset_GTC-V2.csv'
)
# The file is read with ';' as the field separator.
GTC_V2 = pd.read_csv(GTC_V2_URL, sep=';')

In [20]:
# Keep only the rows whose tribunal is 'ECCC', then preview the first three.
is_eccc = GTC_V2['tribunal'] == 'ECCC'
GTC_CAMB = GTC_V2.loc[is_eccc]
GTC_CAMB.head(3)

Unnamed: 0,tribunal,id_transcript,case,accused,date,text,trauma,role,witnesses,n_witnesses,start,id_annotation,id_document,url
16862,ECCC,E1/12.1,Case001,Kaing Guev Eav,2009-04-20 00:00:00 UTC,P R O C E E D I N G S (Judges enter the courtr...,0,Court Proceedings,"[Chan Voeun (KW-31), Chan Khan (KW-32)]",2,0,63b1f39c97ad59b4cfc5793d,63a60e2397ad59b4cfc571ec,http://www.eccc.gov.kh/sites/default/files/doc...
16863,ECCC,E1/12.1,Case001,Kaing Guev Eav,2009-04-20 00:00:00 UTC,MR. PRESIDENT: This morning the Trial Chamber ...,0,JudgeProc,"[Chan Voeun (KW-31), Chan Khan (KW-32)]",2,51,63b1f3a397ad59b4cfc5793e,63a60e2397ad59b4cfc571ec,http://www.eccc.gov.kh/sites/default/files/doc...
16864,ECCC,E1/12.1,Case001,Kaing Guev Eav,2009-04-20 00:00:00 UTC,"THE GREFFIER: Your Honours, Mr. President, the...",0,Court Proceedings,"[Chan Voeun (KW-31), Chan Khan (KW-32)]",2,267,63b1f3a797ad59b4cfc5793f,63a60e2397ad59b4cfc571ec,http://www.eccc.gov.kh/sites/default/files/doc...


In [23]:
# Drop every row whose `role` mentions 'Accused'; only non-accused rows remain.
# NOTE(review): the introduction talks about modelling the accused's own
# testimony, yet this filter removes exactly those rows -- confirm that the
# negation (`~`) is intended.
# na=False keeps rows with a missing role instead of letting NaN reach `~` and
# the boolean indexer (which can raise); regex=False makes 'Accused' a plain
# substring match.
GTC_CAMB = GTC_CAMB.loc[~GTC_CAMB['role'].str.contains('Accused', na=False, regex=False)]

In [25]:
# Features are the raw transcript segments; the label is the accused named on the case.
X, y = GTC_CAMB['text'], GTC_CAMB['accused']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is per row, and several rows share one id_transcript,
# so segments of the same hearing can land in both train and test -- consider
# whether a split grouped by transcript would give a fairer estimate.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [26]:
# Custom stop-word list: wordcloud's built-in STOPWORDS plus corpus-specific noise.
# (Using our own custom stopwords, maybe it will help?)
my_stop = list(STOPWORDS)

# Extra entries, added in one call instead of one append per entry.
# Two entries of the earlier list ended in a stray tab character; a token can
# never contain whitespace, and the tab-free spellings are already listed, so
# the tabbed duplicates are dropped.
# NOTE(review): most of these look like mis-decoded (mojibake) fragments of
# non-ASCII text. With CountVectorizer's default token pattern, single-character
# entries and entries containing non-word characters presumably never match a
# token -- verify which of them actually have an effect.
my_stop.extend([
    # Stray quotation marks.
    '„', '“', '’',
    # Mis-decoded character sequences.
    'ðž', 'ð¹å¬', 'ðµä•ð', 'ðµä', 'ð', 'î',
    'ëå†ä¯', 'ë', 'å†ä¯', 'åÿ', 'å¬', 'åªë',
    'åªå†ð¹â…œåµå™ðµä', 'åªä®ð¹å†î‰ëâ', 'åªä®ð¹', 'åªä®',
    'ää³ðš', 'äâ‚§', 'ä', 'â€', 'äâ‚§åÿ', 'ã‰lisabeth',
    'â…œâ‚¤äœ', 'â‚£ã°ä', 'â‚£', '®åªä³',
    # 'kw' presumably comes from witness codes such as "KW-31" in the witnesses column.
    'c', 'kw',
])

In [27]:
# Text classification pipeline: term counts -> TF-IDF weighting -> linear model trained with SGD.
text_pipeline = Pipeline([
    # Vocabulary capped at 10,000 terms, with the custom stop words removed.
    ('tokenize', CountVectorizer(max_features=10000, stop_words=my_stop)),
    ('tfidf', TfidfTransformer(use_idf=True)),
    # SGD training is stochastic; the seed is pinned (same value as the
    # train/test split) so the reported metrics can be reproduced on a re-run.
    ('classifier', SGDClassifier(random_state=0)),
])

In [28]:
# Fit every pipeline step on the training split only; the test split stays unseen.
text_pipeline.fit(X=X_train, y=y_train)



In [29]:
# Predicted labels for both partitions: train (to gauge fit) and test (to gauge generalisation).
y_pred_train = text_pipeline.predict(X_train)
y_pred_test = text_pipeline.predict(X_test)

In [30]:
#Share of each label in the training split.
#Null accuracy is about 51%: the score obtained by always guessing the majority class (Kaing).
#That is the best accuracy possible without looking at the text at all (not very accurate though),
#so it is the baseline the model has to beat.
class_share = y_train.value_counts(normalize=True)
class_share

accused
Kaing Guev Eav               0.506978
Nuon Chea & Khieu Samphan    0.493022
Name: proportion, dtype: float64

In [31]:
#Accuracy on the rows the model was fitted on.
#This beats the null accuracy, but training accuracy alone cannot show that the model
#meaningfully predicts the Accused -- the held-out test accuracy is the real check.
train_accuracy = metrics.accuracy_score(y_train, y_pred_train)
train_accuracy

0.9333280165882447

In [32]:
#Accuracy on the held-out rows.
#Test accuracy is bigger than null accuracy (as desired).
#In the run shown it sits about five points below the training accuracy (0.886 vs 0.933),
#so there is some overfitting, though not a severe amount.
test_accuracy = metrics.accuracy_score(y_test, y_pred_test)
test_accuracy

0.8858054226475279

In [33]:
#Per-class precision, recall and F1 on the test split.
report = metrics.classification_report(y_test, y_pred_test, zero_division=0)
print(report)

#Reading the report shown below:
#If the model predicts that text is from the Kaing case, 88% of the time it is correct (precision).
#And it is able to identify 90% of all Kaing text in the test set (recall).

#If it predicts that text is from the Chea and Samphan case, 89% of the time it is correct.
#And it is able to identify 87% of all of their text in the test set.

#These percentages are based on using the SGD Classifier with 10000 features

                           precision    recall  f1-score   support

           Kaing Guev Eav       0.88      0.90      0.89      1630
Nuon Chea & Khieu Samphan       0.89      0.87      0.88      1505

                 accuracy                           0.89      3135
                macro avg       0.89      0.89      0.89      3135
             weighted avg       0.89      0.89      0.89      3135



##Using the my_stop custom list of stopwords increased the precision for Kaing by one percentage point and the recall for Chea/Samphan by one percentage point as well.