In [1]:
import pandas as pd
import numpy as np
import pickle
import math
import numpy as np
import string

In [2]:
'''
STEPS:
1. Read english_omitted_words/dev.tsv
2. Split it into augmented (headlines whose words omitted) and original (intact headlines)
3. For a given frame X, find headlines s.t. gold frameX == True within augmented (indices)
4. Read scores 
5. Get scores for which Step 3 holds among the augmented.
6. Get scores for the intact headlines.
7. Concat the scores that you get in Step 6 to the english.csv to match them to IDs.
8. Read the omitted data and left join the prediction scores to original data on ID.
9. Concat the scores obtained in Step 5 to Step 8. Calculate differences in scores.
10. Strip whitespaces in words. 
11. Group by word and aggregate by mean.
12. Sort by mean.
'''

'\nSTEPS:\n1. Read english_omitted_words/dev.tsv\n2. Split it into augmented (headlines whose words omitted) and original (intact headlines)\n3. For a given frame X, find headlines s.t. gold frameX == True within augmented (indices)\n4. Read scores \n5. Get scores for which Step 3 holds among the augmented.\n6. Get scores for the intact headlines.\n7. Concat the scores that you get in Step 6 to the english.csv to match them to IDs.\n8. Read the omitted data and left join the prediction scores to original data on ID.\n9. Concat the scores obtained in Step 5 to Step 8. Calculate differences in scores.\n10. Strip whitespaces in words. \n11. Group by word and aggregate by mean.\n12. Sort by mean.\n'

In [2]:
# Load the original (intact) headlines. The cells below read the columns
# news_title, ID, 'Q3 Theme1' and 'Q3 Theme2' from this frame.
english = pd.read_csv('english.csv')

def omit_words(row):
    """Build the leave-one-word-out variants of a single headline.

    The headline in ``row.news_title`` is split on single spaces, and one
    record is produced per resulting token (empty tokens caused by
    consecutive spaces included).

    Parameters
    ----------
    row : mapping-like with attribute access
        Must expose ``news_title`` and ``ID`` as attributes and
        ``'Q3 Theme1'`` / ``'Q3 Theme2'`` by key.

    Returns
    -------
    list of tuple
        ``(ID, omitted_word, original_headline, headline_without_word,
        theme1, theme2)`` in token order.
    """
    headline = row.news_title
    headline_id = row.ID
    themes = (row['Q3 Theme1'], row['Q3 Theme2'])

    tokens = headline.split(' ')

    return [
        (headline_id, dropped, headline,
         ' '.join(tokens[:pos] + tokens[pos + 1:])) + themes
        for pos, dropped in enumerate(tokens)
    ]

# Expand every headline into its leave-one-word-out records. Each record is a tuple
# (ID, omitted_word, original_headline, omitted_headline, original_frame1, original_frame2).
new_data = [
    record
    for _, headline_row in english.iterrows()
    for record in omit_words(headline_row)
]

augmented_english = pd.DataFrame(new_data)
augmented_english.columns = [
    'ID', 'omitted_word', 'original_headline',
    'omitted_headline', 'theme1', 'theme2',
]

In [3]:
# Pickled experiment output; only its first element is used below as the
# prediction-score matrix.
# NOTE: pickle.load can execute arbitrary code from the file -- only load trusted files.
exp_name = "../multilabel_multibert_cased_focal3_omitted_words_zero_shot.pkl"

with open(exp_name, 'rb') as pkl_file:
    payload = pickle.load(pkl_file)
results = payload[0]

In [4]:
# join the predictions to the english dataset
preds_for_english = pd.DataFrame(results[-1300:,], 
            index=english.index, 
            columns=['old_frame1', 'old_frame2', 'old_frame3', 'old_frame4', 
                     'old_frame5', 'old_frame6', 'old_frame7', 'old_frame8', 'old_frame9']
        )
english_with_preds = pd.concat([english, preds_for_english], axis=1)

In [5]:
# For each of the 9 frames, attach the intact-headline score to the augmented rows,
# but only for headlines whose gold themes include that frame; all other rows get
# NaN from the left join.
for frame in range(9):
    frame_id = frame + 1
    has_frame = (
        english_with_preds['Q3 Theme1'].eq(frame_id)
        | english_with_preds['Q3 Theme2'].eq(frame_id)
    )
    df = english_with_preds.loc[has_frame, ['ID', "old_frame" + str(frame_id)]]
    # left join keeps every augmented row
    augmented_english = augmented_english.merge(df, how='left', on='ID')

In [6]:
# Number of augmented rows (one per omitted word per headline).
len(augmented_english)

15711

In [7]:
# Scores for the word-omitted headlines: every row of `results` except the trailing
# block that belongs to the intact headlines in `english`. The split point is derived
# from the data instead of the hard-coded 1300, and the alignment is checked up front.
n_augmented = len(results) - len(english)
assert n_augmented == len(augmented_english), \
    "rows of results do not line up with augmented_english"
results_df = pd.DataFrame(results[:n_augmented],
            index=augmented_english.index,
            columns=['new_frame' + str(i) for i in range(1, 10)]
        )
augmented_english = pd.concat([augmented_english, results_df], axis=1)

In [8]:
# Per-frame score drop caused by omitting a word: intact score minus omitted score.
for frame in range(9):
    suffix = "frame" + str(frame + 1)
    intact_scores = augmented_english["old_" + suffix]
    omitted_scores = augmented_english["new_" + suffix]
    augmented_english["diff_" + suffix] = intact_scores - omitted_scores

In [35]:
# Normalise the omitted word: trim whitespace first, then peel curly quotes and
# ASCII punctuation off both ends.
edge_chars = "’‘“”" + string.punctuation
trimmed_words = augmented_english['omitted_word'].str.strip()
augmented_english['omitted_word_clean'] = trimmed_words.str.strip(edge_chars)

In [36]:
def get_top_n_words_by_frame(n, frame, data=None):
    """Return the ``n`` words with the largest mean score difference for a frame.

    Parameters
    ----------
    n : int
        Number of words to keep.
    frame : int
        Frame number; the column ``"diff_frame<frame>"`` is aggregated.
    data : pandas.DataFrame, optional
        Frame holding ``omitted_word_clean`` and the ``diff_frame*`` columns.
        Defaults to the notebook-level ``augmented_english`` so existing
        two-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``omitted_word_clean`` with the single column
        ``diff_frame<frame>`` (mean per word), sorted in descending order and
        truncated to ``n`` rows. Rows with a missing word or difference are
        dropped before grouping.
    """
    if data is None:
        data = augmented_english

    col_name = "diff_frame"+str(frame)

    return (
        data[['omitted_word_clean', col_name]]
        .dropna()
        .groupby(['omitted_word_clean'])
        .agg({col_name: 'mean'})
        .sort_values(by=[col_name], ascending=False)
        .head(n)
    )

In [37]:
# Write the top-45 words of every frame to its own CSV file.
out_prefix = 'english_code_switch_words_omitting/frame'
for frame in range(1, 10):
    top_words = get_top_n_words_by_frame(45, frame)
    top_words.to_csv(out_prefix + str(frame) + '.csv')

In [38]:
# Collect the top-45 words of every frame into one Series (duplicates kept).
# The per-frame pieces are gathered first and concatenated once: this avoids seeding
# with a dtype-less empty Series (deprecated) and re-copying on each iteration.
top_words_per_frame = [
    get_top_n_words_by_frame(45, frame).reset_index()['omitted_word_clean']
    for frame in range(1, 10)
]
s = pd.concat(top_words_per_frame)
    


In [39]:
# Distinct top words, ordered case-insensitively.
words_unique = list(s.unique())
words_unique.sort(key=str.casefold)

In [40]:
# Number of distinct top words across all frames.
len(words_unique)

387

In [41]:
# Write the distinct words to disk, one per line (fmt='%s' writes each entry as text).
np.savetxt('english_code_switch_words_omitting/combined_unique.csv', 
           words_unique, fmt='%s')

##### COUNT THE CODE SWITCH WORDS IN THE TRAIN SET FOR ENGLISH

In [1]:
# Load the two word lists: one entry per line, surrounding whitespace stripped.
with open("turkish/CodeSwitched/english_words.txt") as prev_file:
    english_words_prev_cs = [entry.strip() for entry in prev_file]

with open("turkish/OmittedCodeSwitch/english_words.txt") as new_file:
    english_words_new_cs = [entry.strip() for entry in new_file]

In [2]:
# Number of entries (lines) read from the CodeSwitched word list.
len(english_words_prev_cs)

358

In [3]:
# Number of entries (lines) read from the OmittedCodeSwitch word list.
len(english_words_new_cs)

387

In [4]:
# Words present in both lists (exact, case-sensitive match); list order is arbitrary.
intersect = list(set(english_words_prev_cs) & set(english_words_new_cs))

In [5]:
# Number of words shared by the two lists.
len(intersect)

70

In [6]:
# Words present in either list (exact, case-sensitive match); list order is arbitrary.
union = list(set(english_words_prev_cs) | set(english_words_new_cs))

In [7]:
# Number of distinct words across the two lists.
len(union)

675

# Sanity check via inclusion-exclusion. It can only hold when neither word list
# contains exact duplicates, because the list lengths count duplicates and the sets do not.
len(union) == len(english_words_prev_cs) + len(english_words_new_cs) - len(intersect)

In [9]:
# Persist the combined word list, one word per line, ordered case-insensitively.
ordered_union = sorted(union, key=str.casefold)
with open('union_english_words.txt', 'w') as out_file:
    out_file.writelines("%s\n" % entry for entry in ordered_union)

In [12]:
# Tab-separated file read without a header row, so columns get integer labels;
# the cells below use column 1 as the headline text.
train = pd.read_csv("turkish/OmittedCodeSwitch/train-Copy1.tsv", sep='\t', header=None)
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,36,Cremated remains of Las Vegas mass shooter to ...,a,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,47,Florida shooter a troubled loner with white su...,a,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,68,Vernon Hills teen accused of wearing white sup...,a,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,70,Griffith student charged with accidentally bri...,a,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,98,Exclusive: Group chat messages show school sho...,a,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [3]:
# Print every headline that contains a detached contraction token ("'re" or "'s").
# The loop variable is intentionally not named `s`: that name holds the Series of top
# words used to build `words_unique`, and rebinding it to a string here would break
# re-running that cell.
for headline in train[1]:
    tokens = headline.split(' ')
    if "'re" in tokens or "'s" in tokens:
        print(headline)


In [23]:
# First headline: row label 0 of column 1, fetched with a single .loc lookup.
train.loc[0, 1]

'Cremated remains of Las Vegas mass shooter to be kept in safe deposit box, brother says'

In [13]:
# Total number of space-delimited tokens over all headlines in column 1.
total = sum(len(sentence.split(' ')) for sentence in train[1])
print(total)

15711


In [14]:
# Count the (word, headline) pairs in which a word from english_words_prev_cs occurs
# as a space-delimited token of the headline. Headlines are tokenised once up front
# instead of once per word; the resulting count is unchanged.
# NOTE(review): this measures headline-level presence (a word repeated inside one
# headline counts once) while `total` counts tokens, so the ratio mixes units -- confirm intent.
headline_tokens = [set(sentence.split(' ')) for sentence in train[1]]
word_count = sum(
    word in tokens
    for word in english_words_prev_cs
    for tokens in headline_tokens
)

print(word_count)
print(word_count/total)


7522
0.47877283431990325


In [15]:
# Count the (word, headline) pairs in which a word from english_words_new_cs occurs
# as a space-delimited token of the headline. Headlines are tokenised once up front
# instead of once per word; the resulting count is unchanged.
# NOTE(review): this measures headline-level presence (a word repeated inside one
# headline counts once) while `total` counts tokens, so the ratio mixes units -- confirm intent.
headline_tokens = [set(sentence.split(' ')) for sentence in train[1]]
word_count = sum(
    word in tokens
    for word in english_words_new_cs
    for tokens in headline_tokens
)

print(word_count)
print(word_count/total)

2121
0.13500095474508306


In [16]:
# Count the (word, headline) pairs in which a word from `union` occurs as a
# space-delimited token of the headline. Headlines are tokenised once up front
# instead of once per word; the resulting count is unchanged.
# NOTE(review): this measures headline-level presence (a word repeated inside one
# headline counts once) while `total` counts tokens, so the ratio mixes units -- confirm intent.
headline_tokens = [set(sentence.split(' ')) for sentence in train[1]]
word_count = sum(
    word in tokens
    for word in union
    for tokens in headline_tokens
)

print(word_count)
print(word_count/total)

8139
0.5180446820698873


In [35]:
# Frequency of every space-delimited token (case-sensitive, punctuation kept),
# with keys in order of first appearance.
word_dict = {}
for headline in train[1]:
    for token in headline.split(' '):
        word_dict[token] = word_dict.get(token, 0) + 1


In [40]:
from collections import Counter

In [41]:
# Counter over the token frequencies, with entries inserted from most to least frequent.
pairs_by_count = sorted(word_dict.items(), key=lambda pair: pair[1], reverse=True)
sorted_words = Counter(dict(pairs_by_count))

In [43]:
# Frequencies of the 358 most common tokens.
# NOTE(review): 358 equals the displayed len(english_words_prev_cs) -- presumably chosen
# to match that list's size; confirm.
top_values = [count for _, count in sorted_words.most_common(358)]

In [45]:
# Total number of token occurrences covered by the tokens in top_values.
sum(top_values)

8349

In [46]:
# Show the 358 most frequent tokens with their counts (case-sensitive, so e.g.
# differently capitalised forms are separate entries).
sorted_words.most_common(358)

[('to', 429),
 ('gun', 331),
 ('of', 244),
 ('shooting', 239),
 ('in', 228),
 ('for', 210),
 ('the', 174),
 ('a', 164),
 ('on', 163),
 ('Gun', 134),
 ('Trump', 132),
 ('and', 129),
 ('NRA', 114),
 ('after', 97),
 ('Shooting', 95),
 ('school', 91),
 ('with', 90),
 ('at', 90),
 ('Florida', 89),
 ('Pittsburgh', 85),
 ('guns', 82),
 ('control', 80),
 ('Parkland', 76),
 ('synagogue', 63),
 ('says', 56),
 ('The', 56),
 ('as', 56),
 ('violence', 55),
 ('by', 55),
 ('School', 53),
 ('mass', 48),
 ('is', 48),
 ('After', 47),
 ('shootings', 41),
 ('students', 38),
 ('from', 36),
 ('over', 35),
 ('victims', 35),
 ('Vegas', 33),
 ('California', 33),
 ('police', 32),
 ('are', 32),
 ('that', 31),
 ('House', 31),
 ('man', 30),
 ('New', 30),
 ('shooter', 29),
 ('have', 29),
 ('shooting:', 29),
 ('Is', 29),
 ('A', 29),
 ('Guns', 29),
 ('not', 28),
 ('Jacksonville', 28),
 ('about', 28),
 ('To', 27),
 ('Synagogue', 27),
 ('laws', 27),
 ('new', 27),
 ('more', 27),
 ('be', 26),
 ('who', 26),
 ('Control', 2

In [59]:
# Lower-cased copy of the CodeSwitched word list.
prev_lower = list(map(str.lower, english_words_prev_cs))

In [61]:
# Number of distinct entries once case is ignored.
len(np.unique(prev_lower))


289