In [1]:
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from matplotlib import font_manager
import re
from pathlib import Path
from tqdm import tqdm
import json
import pickle
import numpy as np
import collections
import jieba
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, CategoricalNB, MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics  import f1_score,accuracy_score, confusion_matrix

In [3]:
%ls ../CRECIL/Final_Data

dev.json  shuffle_data.py  test.json  train.json


In [2]:
with open('../CRECIL/Final_Data/train.json','rb') as infile:
    train_df = json.loads(infile.read())

In [3]:
with open('../CRECIL/Final_Data/dev.json','rb') as infile:
    dev_df = json.loads(infile.read())

In [4]:
with open('../CRECIL/Final_Data/test.json','rb') as infile:
    test_df = json.loads(infile.read())

### Question 1: 
how does per:alternate_name get predicted, if we have masked names? <br/>
**Answer**: "The model's input consists of a dialogue and a character entity pair to be recognized". I take this to mean that the pairs of entities within the dialogue are given, so models don't have to recreate the pairs from the dialogue itself.

In [5]:
def get_blank_relations(annotations:list) -> list:
    """
    Take the labels and clear out the gold-standard relations, 
    to be filled with predictions by model
    """
    pred_list =[]
    
    for item in annotations:
        copy = item.copy()
        copy['r'] = []
        copy['rid'] = []
        pred_list.append(copy)
    
    return pred_list


def ch_tokenizer(input_str:str):
    #tokenize sentence and return as list
    tokenized = list(jieba.cut(input_str))
    return tokenized

def get_num_speakers(transcript:list)-> int:
    """
    return number of speakers in scene
    """
    ch_set = set()
    for line in transcript:
        ch_set.add(re.findall('S.*(?=:)',line)[0])
    
    total = len(ch_set)
    return total

In [10]:
rid_to_rel = Counter() #given an rid, return the relation
rel_to_rid = dict() #given a relation, return the rid

for item in train_df:
    for rel in item[1]:
        
        for i in range(0,len(rel['r'])):
            if rel['r'][i] not in rel_to_rid:
                rel_to_rid[rel['r'][i]] = rel['rid'][i]
            if rel['rid'][i] not in rid_to_rel:
                rid_to_rel[rel['rid'][i]] = rel['r'][i]
                
rid_to_rel = collections.OrderedDict(sorted(rid_to_rel.items()))

In [66]:
with open('rid_to_rel.pickle','wb') as outfile:
    pickle.dump(rid_to_rel,outfile,protocol=pickle.HIGHEST_PROTOCOL)

with open('rel_to_rid.pickle','wb') as outfile:
    pickle.dump(rel_to_rid,outfile,protocol=pickle.HIGHEST_PROTOCOL)

In [6]:
with open('rid_to_rel.pickle','rb') as infile:
    rid_to_rel = pickle.load(infile)

with open('rel_to_rid.pickle','rb') as infile:
    rel_to_rid = pickle.load(infile)

Watch [this link](https://www.google.com/search?q=add+features+feature+engineering+one-hot+encoding+with+bert&oq=add+features+feature+engineering+one-hot+encoding+with+bert&gs_lcrp=EgZjaHJvbWUyBggAEEUYOTIHCAEQIRigAdIBCTExODI1ajBqN6gCALACAA&sourceid=chrome&ie=UTF-8#fpstate=ive&vld=cid:66503114,vid:NbbsVcs42jE)

Analysis: make a correlation matrix among the 32 relations. Surely per:parent and per:child must occur together frequently

In [38]:
total_dev = 0
for i in range(0,len(dev_df)):
    total_dev += len(dev_df[i][1])
total_dev

7422

In [39]:
dev_df[0][1]

[{'x': 'S 1', 'y': 'S 2', 'r': ['per:children'], 'rid': [1]},
 {'x': 'S 1',
  'y': 'S 3',
  'r': ['per:friends', 'per:acquaintance'],
  'rid': [11, 3]},
 {'x': 'S 1', 'y': '小姑', 'r': ['per:relative'], 'rid': [29]},
 {'x': 'S 1',
  'y': '朝阳叔叔',
  'r': ['per:friends', 'per:acquaintance'],
  'rid': [11, 3]},
 {'x': 'S 1', 'y': '小桂阿姨', 'r': ['unanswerable'], 'rid': [31]},
 {'x': 'S 1',
  'y': '朝阳',
  'r': ['per:friends', 'per:acquaintance'],
  'rid': [11, 3]},
 {'x': 'S 2', 'y': 'S 1', 'r': ['per:parents'], 'rid': [2]},
 {'x': 'S 2', 'y': 'S 3', 'r': ['per:acquaintance'], 'rid': [3]},
 {'x': 'S 2', 'y': '小姑', 'r': ['per:relative'], 'rid': [29]},
 {'x': 'S 2', 'y': '朝阳叔叔', 'r': ['per:acquaintance'], 'rid': [3]},
 {'x': 'S 2', 'y': '小桂阿姨', 'r': ['unanswerable'], 'rid': [31]},
 {'x': 'S 2', 'y': '朝阳', 'r': ['per:acquaintance'], 'rid': [3]},
 {'x': 'S 3', 'y': 'S 1', 'r': ['per:friends'], 'rid': [11]},
 {'x': 'S 3', 'y': 'S 2', 'r': ['per:acquaintance'], 'rid': [3]},
 {'x': 'S 3', 'y': '小姑', '

In [37]:
len(dev_df)

116

In [13]:
len(rid_to_rel)

32

In [86]:
#create empty data frame
zero_data = np.zeros(shape=(len(train_df),len(rid_to_rel)))
gt_df = pd.DataFrame(zero_data, columns=rid_to_rel.values())

for i in range(0,len(train_df)):
    for rel_item in train_df[i][1]:
        for j in rel_item['rid']:
            gt_df.iat[i,j] = 1

In [100]:
Counter(gt_df['per:alternate_name'])

Counter({1.0: 429, 0.0: 53})

In [88]:
corr_matrix = gt_df.corr()

In [89]:
corr_matrix

Unnamed: 0,per:alternate_name,per:children,per:parents,per:acquaintance,per:client,per:colleague,per:ex-girlfriend,per:girlfriend,per:dates,per:ex-boyfriend,...,per:siblings,per:spouse,per:grandparents,per:grandchildren,per:teacher,per:student,per:roommate,per:relative,per:siblings-in-law,unanswerable
per:alternate_name,1.0,0.11707,0.11707,0.055997,0.08046,0.085296,0.069229,0.07868,0.063102,0.069229,...,0.069274,0.108441,0.092696,0.092696,0.048484,0.048484,0.016026,0.095654,0.042669,0.090971
per:children,0.11707,1.0,1.0,-0.006048,-0.03397,-0.037705,-0.12809,-0.150463,-0.115819,-0.12809,...,0.26816,0.51623,0.402224,0.402224,-0.02047,-0.02047,0.027564,0.271393,0.034214,0.099455
per:parents,0.11707,1.0,1.0,-0.006048,-0.03397,-0.037705,-0.12809,-0.150463,-0.115819,-0.12809,...,0.26816,0.51623,0.402224,0.402224,-0.02047,-0.02047,0.027564,0.271393,0.034214,0.099455
per:acquaintance,0.055997,-0.006048,-0.006048,1.0,-0.043498,-0.021592,-0.022164,-0.013124,0.058844,-0.022164,...,0.039423,0.093797,0.092402,0.092402,0.069985,0.069985,-0.019263,-0.043313,-0.051286,0.027149
per:client,0.08046,-0.03397,-0.03397,-0.043498,1.0,0.099027,-0.045087,-0.051243,-0.058975,-0.045087,...,-0.081649,-0.08755,-0.056349,-0.056349,-0.031576,-0.031576,-0.010438,-0.1023,-0.027789,0.131067
per:colleague,0.085296,-0.037705,-0.037705,-0.021592,0.099027,1.0,-0.04839,-0.034804,0.043607,-0.04839,...,-0.139865,-0.025027,-0.025115,-0.025115,-0.055904,-0.055904,-0.018479,-0.099105,0.249716,0.149115
per:ex-girlfriend,0.069229,-0.12809,-0.12809,-0.022164,-0.045087,-0.04839,1.0,0.77721,0.538054,1.0,...,-0.00568,-0.064143,-0.031815,-0.031815,0.134502,0.134502,-0.008981,0.045175,-0.02391,0.112771
per:girlfriend,0.07868,-0.150463,-0.150463,-0.013124,-0.051243,-0.034804,0.77721,1.0,0.506325,0.77721,...,-0.054955,-0.09722,-0.069105,-0.069105,0.041021,0.041021,-0.010207,0.005557,-0.027174,0.128167
per:dates,0.063102,-0.115819,-0.115819,0.058844,-0.058975,0.043607,0.538054,0.506325,1.0,0.538054,...,-0.020943,-0.10731,-0.059057,-0.059057,0.027903,0.027903,-0.011747,0.05303,-0.031275,0.08776
per:ex-boyfriend,0.069229,-0.12809,-0.12809,-0.022164,-0.045087,-0.04839,1.0,0.77721,0.538054,1.0,...,-0.00568,-0.064143,-0.031815,-0.031815,0.134502,0.134502,-0.008981,0.045175,-0.02391,0.112771


In [96]:
sort(corr_matrix['per:boss'])

[('per:acquaintance', -0.08208033022655042),
 ('per:alternate_name', 0.04169164806752479),
 ('per:boss', 1.0),
 ('per:boyfriend', 0.168066785272256),
 ('per:children', 0.0328757046813823),
 ('per:children-in-law', 0.132118874953544),
 ('per:classmate', 0.03727483722089884),
 ('per:client', -0.034528935855916675),
 ('per:colleague', -0.10766179874679062),
 ('per:dates', 0.09941876083041462),
 ('per:ex-boyfriend', 0.17851966356077914),
 ('per:ex-girlfriend', 0.17851966356077914),
 ('per:friends', -0.09633133550269961),
 ('per:girlfriend', 0.168066785272256),
 ('per:grandchildren', 0.1253900217410204),
 ('per:grandparents', 0.1253900217410204),
 ('per:negative impression', 0.10295738431118671),
 ('per:neighbor', -0.025789516339307476),
 ('per:nickname', -0.0254849056436838),
 ('per:nurse', 0.7588329056007194),
 ('per:parents', 0.0328757046813823),
 ('per:parents-in-law', 0.132118874953544),
 ('per:positive impression', -0.019749396565728463),
 ('per:relative', 0.38772870409144783),
 ('per

Here's a show-specific variable: dialogue-size. I wonder if the rarer relation labels aren't more common only in larger group settings than smaller?

In [116]:
get_num_speakers(train_df[0][0])

4

In [None]:
#copy code from prev work. 

Main deliberation: these sets of y-variables are co-dependent. Some are very likely to occur together, but it seems harder to model a scenario where a model predicts 5 out of 10 relations, than individually asking: is this relation present or not? 

I'm going to start with the latter problem because I know how to set it up, and then I'm going to research and brainstorm how to set up the first problem.

Short term goal: set up a model that can make predictions based on basic features, and be able to run the evaluation script on it. 

**Step 1**: make a gt data frame with present relations for each dialogue

In [19]:
zero_data = np.zeros(shape=(len(train_df),len(rid_to_rel))) #change length
train_gt = pd.DataFrame(zero_data, columns=rid_to_rel.values())

for i in range(0,len(train_df)):
    for rel_item in train_df[i][1]:
        for j in rel_item['rid']:
            train_gt.iat[i,j] = 1

zero_data = np.zeros(shape=(len(dev_df),len(rid_to_rel)))
dev_gt = pd.DataFrame(zero_data, columns=rid_to_rel.values())

for i in range(0,len(dev_df)):
    for rel_item in dev_df[i][1]:
        for j in rel_item['rid']:
            dev_gt.iat[i,j] = 1

zero_data = np.zeros(shape=(len(test_df),len(rid_to_rel)))
test_gt = pd.DataFrame(zero_data, columns=rid_to_rel.values())

for i in range(0,len(test_df)):
    for rel_item in test_df[i][1]:
        for j in rel_item['rid']:
            test_gt.iat[i,j] = 1

In [9]:
train_array = []
for i in range(0,len(train_df)):
    
    temp = ""
    for row in train_df[i][0]:
        temp += row + '\n'
    train_array.append(temp)

dev_array = []
for i in range(0,len(dev_df)):
    
    temp = ""
    for row in dev_df[i][0]:
        temp += row + '\n'
    dev_array.append(temp)
    
test_array = []
for i in range(0,len(test_df)):
    
    temp = ""
    for row in test_df[i][0]:
        temp += row + '\n'
    test_array.append(temp)

In [12]:
cv = CountVectorizer(tokenizer=ch_tokenizer,ngram_range=(1,3),min_df=2,max_df=0.55)
#train_array,
count_vector=cv.fit_transform(train_array)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.459 seconds.
Prefix dict has been built successfully.


In [13]:
X_train = count_vector.toarray()

In [18]:
X_dev_count = cv.transform(dev_array)
X_dev = X_dev_count.toarray()

X_test_count = cv.transform(test_array)
X_test = X_test_count.toarray()

In [21]:
#rename for consistency
y_train = train_gt
y_test = test_gt

In [28]:
len(dev_gt)

116

In [24]:
# Create feature matrix

#split data if needed
y = train_gt
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)




In [29]:
y_train.columns

Index(['per:alternate_name', 'per:children', 'per:parents', 'per:acquaintance',
       'per:client', 'per:colleague', 'per:ex-girlfriend', 'per:girlfriend',
       'per:dates', 'per:ex-boyfriend', 'per:boyfriend', 'per:friends',
       'per:nickname', 'per:neighbor', 'per:nurse', 'per:parents-in-law',
       'per:children-in-law', 'per:positive impression', 'per:classmate',
       'per:negative impression', 'per:subordinate', 'per:boss',
       'per:siblings', 'per:spouse', 'per:grandparents', 'per:grandchildren',
       'per:teacher', 'per:student', 'per:roommate', 'per:relative',
       'per:siblings-in-law', 'unanswerable'],
      dtype='object')

In [22]:
mnb = MultinomialNB()
y_pred = mnb.fit(X_train, y_train['per:roommate']).predict(X_test)

In [23]:
print("Number of mislabeled points out of a total %d points : %d"% (X_test.shape[0], (y_test['per:roommate'] != y_pred).sum()))
Counter(y_pred) 

Number of mislabeled points out of a total 71 points : 0


Counter({0.0: 71})

In [33]:

#fit
SVM = SVC(kernel = 'linear')
SVM.fit(X_train,y_train['per:roommate'])
y_pred=SVM.predict(X_dev)

In [34]:
print("Number of mislabeled points out of a total %d points : %d"% (X_dev.shape[0], (dev_gt['per:roommate'] != y_pred).sum()))
Counter(y_pred)

Number of mislabeled points out of a total 116 points : 0


Counter({0.0: 116})

In [29]:
#evaluate
print(accuracy_score(y_test['per:roommate'],y_pred))
print(f1_score(y_test['per:roommate'],y_pred))
confusion_matrix(y_test['per:roommate'], y_pred) #even more the case than the Logistic Regression

1.0
0.0


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


array([[71]])

In [None]:

neg_class_prob_sorted = clf.feature_log_prob_[0, :].argsort()[::-1]
pos_class_prob_sorted = clf.feature_log_prob_[1, :].argsort()[::-1]

In [None]:


"""
clf.classes_  gives array(['tu', 'vous'], dtype='<U4')
so the second one, 1, is vous, and that's what's getting shown in the 
pos_class
"""
print(clf.classes_)
print(np.take(count.get_feature_names(), neg_class_prob_sorted[:10]))
print(np.take(count.get_feature_names(), pos_class_prob_sorted[:10])) 