In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# The tidytext package isn't in Google Colaboratory's default list of packages, so we install it first
!pip install tidytext
import tidytext
# This library is used by tidytext for tokenization
import nltk
nltk.download('punkt')

# For making word clouds (unsurprisingly)
from wordcloud import WordCloud, STOPWORDS

Collecting tidytext
  Downloading tidytext-0.0.1.tar.gz (4.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting siuba (from tidytext)
  Downloading siuba-0.4.4-py3-none-any.whl (208 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m208.6/208.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: tidytext
  Building wheel for tidytext (setup.py) ... [?25l[?25hdone
  Created wheel for tidytext: filename=tidytext-0.0.1-py3-none-any.whl size=3871 sha256=5b103e78651f79fd3675317429442e806d4de5ee1760f761073c2673e3f1b1ea
  Stored in directory: /root/.cache/pip/wheels/88/40/40/04f8d22d7729547afa13c2cbffb494737351dd4465f2f26288
Successfully built tidytext
Installing collected packages: siuba, tidytext
Successfully installed siuba-0.4.4 tidytext-0.0.1


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
GTC = pd.read_csv('https://raw.githubusercontent.com/MiriamSchirmer/genocide-transcript-corpus/main/Dataset_GTC-V2.csv', sep =';')

In [3]:
GTC.head(3)

Unnamed: 0,tribunal,id_transcript,case,accused,date,text,trauma,role,witnesses,n_witnesses,start,id_annotation,id_document,url
0,ICTR,TRS16748R0000629886,ICTR-98-44-T,Callixte Nzabonimana,2006-02-22 00:00:00 UTC,"MR. PRESIDENT: Good morning, everybody. Mr. Re...",0,JudgeProc,[UB],1,23,63d000d797ad59b4cfc626f2,639a47b297ad59b4cfc56852,https://ucr.irmct.org/LegalRef/CMSDocStore/Pub...
1,ICTR,TRS16748R0000629886,ICTR-98-44-T,Callixte Nzabonimana,2006-02-22 00:00:00 UTC,"MR. HOMETOWU: Thank you, Mr. President. Trial ...",0,Court Proceedings,[UB],1,120,63d0010e97ad59b4cfc62704,639a47b297ad59b4cfc56852,https://ucr.irmct.org/LegalRef/CMSDocStore/Pub...
2,ICTR,TRS16748R0000629886,ICTR-98-44-T,Callixte Nzabonimana,2006-02-22 00:00:00 UTC,"MR. PRESIDENT: Appearances, please.",0,JudgeProc,[UB],1,525,63d000fd97ad59b4cfc626ff,639a47b297ad59b4cfc56852,https://ucr.irmct.org/LegalRef/CMSDocStore/Pub...


In [15]:
GTC.columns

Index(['tribunal', 'id_transcript', 'case', 'accused', 'date', 'text',
       'trauma', 'role', 'witnesses', 'n_witnesses', 'start', 'id_annotation',
       'id_document', 'url'],
      dtype='object')

In [13]:
GTCAdj = tidytext.unnest_tokens(GTC, 'word', 'text')
GTCAdj = GTCAdj[['tribunal', 'word']]

In [None]:
GTCAdj.head(3)

Unnamed: 0,tribunal,word
0,ICTR,mr
0,ICTR,president
0,ICTR,good


In [None]:
counts = GTCAdj.groupby('tribunal')['word'].value_counts()
counts.name = 'n'
counts = counts.reset_index()

In [None]:
counts.head()

Unnamed: 0,tribunal,word,n
0,ECCC,the,42133
1,ECCC,to,22419
2,ECCC,and,18655
3,ECCC,i,17540
4,ECCC,you,16522


In [None]:
my_stop = list(STOPWORDS)
my_stop.append('q')
my_stop.append('mr')
my_stop.append('zvornikmilicibratunac')
my_stop.append('thank')
my_stop.append('president')

In [None]:
count_clean = counts.loc[~counts['word'].isin(my_stop)]

In [None]:
count_clean.sort_values('n', ascending=True)

Unnamed: 0,tribunal,word,n
35423,ICTY,zune,1
11127,ECCC,khmerthai,1
11128,ECCC,khmertype,1
11129,ECCC,khmner,1
11130,ECCC,khmuoch,1
...,...,...,...
23272,ICTY,witness,2582
41,ECCC,time,2722
23266,ICTY,yes,2946
23263,ICTY,judge,3022


In [None]:
tfidf = tidytext.bind_tf_idf(count_clean, 'word', 'tribunal', 'n')
tfidf = tfidf.fillna(0)

In [None]:
tfidf.head()

Unnamed: 0,tribunal,word,n,tf,idf,tf_idf
41,ECCC,time,2722,0.009344,0.0,0.0
45,ECCC,know,2372,0.008143,0.0,0.0
46,ECCC,people,2368,0.008129,0.0,0.0
49,ECCC,witness,2228,0.007648,0.0,0.0
53,ECCC,said,1830,0.006282,0.0,0.0


In [None]:
tfidf.loc[tfidf['tf_idf']!=0]

Unnamed: 0,tribunal,word,n,tf,idf,tf_idf
88,ECCC,person,1052,0.003611,1.098612,0.003967
109,ECCC,work,806,0.002767,0.405465,0.001122
111,ECCC,asked,800,0.002746,0.405465,0.001114
123,ECCC,duch,681,0.002338,1.098612,0.002568
124,ECCC,heard,679,0.002331,1.098612,0.002561
...,...,...,...,...,...,...
34998,ICTY,terraces,1,0.000004,1.098612,0.000005
34999,ICTY,terrifying,1,0.000004,1.098612,0.000005
35000,ICTY,territorially,1,0.000004,1.098612,0.000005
35001,ICTY,terrorising,1,0.000004,1.098612,0.000005


In [None]:
tfidf = tfidf.sort_values('tf_idf', ascending=False)

In [None]:
tfidf.head()

Unnamed: 0,tribunal,word,n,tf,idf,tf_idf
23272,ICTY,witness,2582,0.010872,1.098612,0.011944
13175,ICTR,will,1350,0.005894,1.098612,0.006475
13198,ICTR,well,1006,0.004392,1.098612,0.004825
88,ECCC,person,1052,0.003611,1.098612,0.003967
13218,ICTR,prosecutor,762,0.003327,1.098612,0.003655


In [None]:
tfidf.groupby('tribunal')['tf_idf'].idxmax()

tribunal
ECCC       88
ICTR    13175
ICTY    23272
Name: tf_idf, dtype: int64

In [None]:
tfidf.loc[tfidf.groupby('tribunal')['tf_idf'].idxmax()]

Unnamed: 0,tribunal,word,n,tf,idf,tf_idf
88,ECCC,person,1052,0.003611,1.098612,0.003967
13175,ICTR,will,1350,0.005894,1.098612,0.006475
23272,ICTY,witness,2582,0.010872,1.098612,0.011944


##ta means grandfather / or refers to an ancestor##

In [None]:
ECCCTF = tfidf.loc[tfidf['tribunal']=='ECCC']
ECCTF = ECCCTF.loc[ECCCTF['tf_idf']!=0]

In [None]:
ECCCTF.head()

Unnamed: 0,tribunal,word,n,tf,idf,tf_idf
88,ECCC,person,1052,0.003611,1.098612,0.003967
123,ECCC,duch,681,0.002338,1.098612,0.002568
124,ECCC,heard,679,0.002331,1.098612,0.002561
148,ECCC,remember,608,0.002087,1.098612,0.002293
181,ECCC,worked,505,0.001734,1.098612,0.001905
