# Demo notebook for *Identification of Dialect for Eastern and Southwestern Ojibwe Words Using a Small Corpus*
### AmericasNLP 2023 submission

Imports

In [1]:
import re
import os
from itertools import chain
import numpy as np
import operator
import copy
import pickle
import pandas as pd
from operator import itemgetter
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import  linear_model
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline



Helper function definitions

In [2]:

#loading code for different data formats 

#onj is journal - lots of stories in one text file
def readonj(file, encoding='utf-8'):
    """Read the combined journal text file and split it into sentences.

    Hyphens and semicolons are turned into spaces; commas, parentheses,
    straight and curly double quotes, colons, '!', '?' and newlines are
    deleted. The result is lower-cased and split on '.'.

    Args:
        file: path of the text file to read.
        encoding: text encoding used to decode the file. The cleanup table
            targets curly quotes, so the input is expected to contain
            non-ASCII characters; decoding explicitly avoids depending on
            the platform default encoding.

    Returns:
        List of sentence strings. The final piece produced by the split
        (whatever follows the last '.') is dropped.
    """
    # Single-pass equivalent of the original chain of one-character replaces.
    cleanup = str.maketrans({
        ',': None, '(': None, ')': None, '"': None, ':': None,
        '“': None, '”': None, '!': None, '?': None, '\n': None,
        '-': ' ', ';': ' ',
    })
    with open(file, encoding=encoding) as inputFile:
        text = inputFile.read()
    sentences = text.translate(cleanup).lower().split('.')
    return sentences[:-1] #remove final split dangler

#eastern dialect stories are one story per file
def reade(file, encoding='utf-8'):
    """Read one Eastern-dialect story file and split it into sentences.

    Digits are removed first. Hyphens and semicolons are then turned into
    spaces; commas, parentheses, curly double quotes, colons, '!', '?' and
    newlines are deleted. The result is lower-cased and split on '.'.
    Unlike readonj, straight double quotes are left in place.

    Args:
        file: path of the story file to read.
        encoding: text encoding used to decode the file. The cleanup table
            targets curly quotes, so the input is expected to contain
            non-ASCII characters; decoding explicitly avoids depending on
            the platform default encoding.

    Returns:
        List of sentence strings. The final piece produced by the split
        (whatever follows the last '.') is dropped.
    """
    # Single-pass equivalent of the original chain of one-character replaces.
    cleanup = str.maketrans({
        '(': None, ')': None, ',': None, ':': None,
        '“': None, '”': None, '!': None, '?': None, '\n': None,
        '-': ' ', ';': ' ',
    })
    with open(file, encoding=encoding) as inputFile:
        text = inputFile.read()
    text = re.sub(r'\d', '', text) #remove digits
    sentences = text.translate(cleanup).lower().split('.')
    return sentences[:-1] #remove final split dangler




#ngram generators
def word2ngrams(text, n=3):
    """Return every contiguous length-n slice of text, in left-to-right order.

    A text shorter than n yields an empty list.
    """
    grams = []
    for start in range(len(text) - n + 1):
        grams.append(text[start:start + n])
    return grams

def sent2ngrams_counted(text, n=3):
    """Return the character n-grams of every word in text as one flat list.

    Full stops are removed, the text is lower-cased and split on whitespace;
    n-grams are taken within each word (never across a word boundary) and
    repeated n-grams are kept.
    """
    tokens = text.replace('.','').lower().split()
    nGrams = []
    for token in tokens:
        nGrams.extend(word2ngrams(token, n))
    return nGrams
    
def TF_nGrams(nGrams):
    """Map each distinct n-gram to its relative frequency within nGrams.

    Every occurrence contributes 1/len(nGrams), so the values sum to
    (approximately) 1. An empty input gives an empty dict.
    """
    frequencies = {gram: 0 for gram in set(nGrams)}
    for gram in nGrams:
        frequencies[gram] = frequencies[gram] + 1/len(nGrams)
    return frequencies

def sorted_nGram_dict(wordDict):
    """Return the dict's (key, value) pairs as a list, largest value first.

    Pairs with equal values keep their relative order from the dict.
    """
    pairs = list(wordDict.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs


def compute_n_grams(sentences,n,start_n=2):
    """Build a dict of n-gram relative frequencies for sizes start_n..n inclusive.

    sentences may be a single string or an iterable of strings. For each
    n-gram size, the n-grams of all sentences are pooled and converted to
    relative frequencies with TF_nGrams, so frequencies are normalized per
    n-gram size. The per-size dicts are merged into one result.
    """
    # A bare string is one sentence; anything else is iterated as a collection.
    batch = [sentences] if type(sentences) is str else sentences
    all_ngrams = {}
    for size in np.arange(start_n, n+1):
        grams_of_size = [
            gram
            for sentence in batch
            for gram in sent2ngrams_counted(sentence, n=size)
        ]
        all_ngrams.update(TF_nGrams(grams_of_size))
    return all_ngrams

Because the text files are not shared with this demo notebook, the demonstration of the model on the word list is the only live part of the shared code. The rest of the notebook is shared as documentation to assist others who want to replicate this work in the future.

In [3]:
#Load pre-trained model from model.pkl (path is relative to the working directory)
#NOTE: unpickling can execute code stored in the file - only load a model.pkl you trust
with open("model.pkl", "rb") as f:
    clf = pickle.load(f)

In [4]:
#Load the initialized (empty) n-gram dictionary; its keys and their order define the feature vector layout used below
#NOTE: unpickling can execute code stored in the file - only load a saved_nGram_dictionary.pkl you trust
with open('saved_nGram_dictionary.pkl', 'rb') as f:
    initialized_nGram_dict_sw_e = pickle.load(f)

In [5]:
#Load word list and create labels - use 1 for SW and 0 for E
df = pd.read_csv('./data/fiftyOjibweWordsE_SW.csv')
sw_words = df['Ojibwe SW'].to_list()
e_words = df['Ojibwe E/Odawa'].to_list()
words = sw_words + e_words
#Size the labels from the columns themselves instead of a hard-coded 50,
#so labels always line up with words if the word list changes
labels = list(np.ones(len(sw_words))) + list(np.zeros(len(e_words)))

In [6]:
#Create features for words and load into dictionary with all n-grams
ngrams_count = 4

#Row position of every known n-gram, in the dictionary's own key order
feature_index = {gram: row for row, gram in enumerate(initialized_nGram_dict_sw_e)}
#Starting value of every feature, taken from the loaded dictionary
baseline = np.fromiter(initialized_nGram_dict_sw_e.values(), dtype=float)

#One column per word, one row per known n-gram
featurized_words =  np.zeros((len(initialized_nGram_dict_sw_e),len(words)))
for column, word in enumerate(words):
    featurized_words[:,column] = baseline
    for gram, frequency in compute_n_grams(word,ngrams_count).items():
        row = feature_index.get(gram)
        #n-grams that are not in the loaded dictionary are ignored
        if row is not None:
            featurized_words[row,column] = frequency


#Make predictions on words    
word_results = clf.predict(featurized_words.T)

#Run reports on results
print(classification_report(labels,word_results))
print(confusion_matrix(labels,word_results))

              precision    recall  f1-score   support

         0.0       0.71      0.72      0.71        50
         1.0       0.71      0.70      0.71        50

    accuracy                           0.71       100
   macro avg       0.71      0.71      0.71       100
weighted avg       0.71      0.71      0.71       100

[[36 14]
 [15 35]]


For those reading the paper, please note that we have manually removed the 3 repeated words from the SW Ojibwe counts in the confusion matrix and calculation of accuracy in the paper.

## Everything after this point is meant for documentation and replication only and is not meant to be a live demo

Data loading code, commented because we don't have permission to reproduce full texts

In [7]:
# # SW data is all combined into one text file
# file = './data/sw/onj8_1.txt'
# southwestern_sentences = readonj(file)
# print(len(southwestern_sentences), 'total sentences')
# print(np.mean([len(a.split(' ')) for a in southwestern_sentences]), 'average words per sentence')

In [8]:
# # E data is separate files per story
# directory = './data/e'
# files = os.listdir(directory)
# eastern_sentences = []
# for file in files:
#     if os.path.isfile(directory + '/' + file):
#         eastern_sentences.extend(reade(directory + '/' +file))
# print(len(eastern_sentences), 'total sentences')
# print(np.mean([len(a.split(' ')) for a in eastern_sentences]), 'average words per sentence')

In [9]:
# # Load all data to capture all n-grams so we have everything to start.
# # Commented because stories are not shared
 
# all_sw_ngrams = compute_n_grams(southwestern_sentences,ngrams_count,start_n=1)
# all_e_ngrams = compute_n_grams(eastern_sentences,ngrams_count,start_n=1)

Combine the n-grams found in both dialects into one master list of keys, so that the shared feature dictionary can be initialized with an entry for every n-gram.

In [10]:

# combined_keys = list(TF_nGrams(list(all_sw_ngrams.keys()) + list(all_e_ngrams.keys())).keys())
# len(combined_keys)

In [11]:
# initialized_nGram_dict = {key: 0 for key in combined_keys}

In [12]:
# combined_keys = list(TF_nGrams(list(all_sw_ngrams.keys()) + list(all_e_ngrams.keys())))
# print(len(combined_keys))
# initialized_nGram_dict_sw_e = {key: 0 for key in combined_keys}

Create combined and labeled dataset

In [13]:
# X = eastern_sentences + southwestern_sentences
# Y = np.concatenate((np.zeros(len(eastern_sentences)),np.ones(len(southwestern_sentences))))

Build model

In [14]:
# clf = make_pipeline(StandardScaler(),
#                     SGDClassifier(max_iter=1000, tol=1e-3, loss='hinge')) #loss huber was initially tested, didn't work well



In [15]:
# #5 fold cross validation
# skf = StratifiedKFold(n_splits=5, shuffle = True)

# #save weights and average for final model
# all_weights = np.zeros((len(initialized_nGram_dict_sw_e),))


# #initialize feature space    
# featurized_x =  np.zeros((len(initialized_nGram_dict_sw_e),len(X)))

# #build features
# for index,sentence in enumerate(featurized_x.T):
#     features = compute_n_grams(X[index],ngrams_count)
#     new_dict = copy.deepcopy(initialized_nGram_dict_sw_e)
#     for something in features.keys():
#             if something in new_dict.keys():
#                 new_dict[something] = features[something]
#     featurized_x[:,index] = np.fromiter(new_dict.values(), dtype=float)
    
# #run the cross validation
# single_sentence_inputs = []
# single_sentence_labels = []
# for i, (train_index, test_index) in enumerate(skf.split(X, Y)):
#     clf.fit(featurized_x[:,train_index].T, Y[train_index])
    
#     single_sentence_inputs.extend(clf.predict(featurized_x[:,test_index].T))
#     single_sentence_labels.extend(Y[test_index])
#     all_weights += np.asarray(clf[1].coef_[0])

# #compute and display combined results
# results = np.asarray(single_sentence_inputs).reshape(-1)
# truth_labels = np.asarray(single_sentence_labels).reshape(-1)
# print(classification_report(truth_labels,results))
# print(confusion_matrix(truth_labels,results))


# #show most influential n-grams
# weight_dict = dict(zip(list(initialized_nGram_dict_sw_e.keys()),all_weights/5))
# print(sorted_nGram_dict(weight_dict)[:10])
# print(sorted_nGram_dict(weight_dict)[-10:])

Repeat for 2 sentence groups

In [16]:
# skf = StratifiedKFold(n_splits=5, shuffle = True)
# # skf.get_n_splits(X, Y)
# all_weights = np.zeros((len(initialized_nGram_dict_sw_e),))


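# # NOTE(review): the SW pairs are joined with x+y (no ' ' separator) while the E pairs use x+' '+y,
# # and Y2 is sized from [0::2], which is one longer than the zipped pairs when a dialect has an odd
# # number of sentences. The 3/4/5-sentence cells avoid both (they join with ' ' and size labels from p1/p2).
# X2 = [ x+' '+y for x,y in zip(eastern_sentences[0::2], eastern_sentences[1::2]) ] + [ x+y for x,y in zip(southwestern_sentences[0::2], southwestern_sentences[1::2]) ]
# Y2 = np.concatenate((np.zeros(len(eastern_sentences[0::2])),np.ones(len(southwestern_sentences[0::2]))))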



# featurized_x =  np.zeros((len(initialized_nGram_dict_sw_e),len(X2)))
# # featurized_x = compute_n_grams(X,ngrams_count)
# for index,sentence in enumerate(featurized_x.T):
#     features = compute_n_grams(X2[index],ngrams_count)
#     new_dict = copy.deepcopy(initialized_nGram_dict_sw_e)
#     for something in features.keys():
#             if something in new_dict.keys():
#                 new_dict[something] = features[something]
#     featurized_x[:,index] = np.fromiter(new_dict.values(), dtype=float)
    
# single_sentence_inputs = []
# single_sentence_labels = []
# for i, (train_index, test_index) in enumerate(skf.split(X2, Y2)):
#     clf.fit(featurized_x[:,train_index].T, Y2[train_index])
    
#     single_sentence_inputs.extend(clf.predict(featurized_x[:,test_index].T))
#     single_sentence_labels.extend(Y2[test_index])
#     all_weights += np.asarray(clf[1].coef_[0])
    
# results = np.asarray(single_sentence_inputs).reshape(-1)
# truth_labels = np.asarray(single_sentence_labels).reshape(-1)

# print(classification_report(truth_labels,results))
# print(confusion_matrix(truth_labels,results))



# weight_dict = dict(zip(list(initialized_nGram_dict_sw_e.keys()),all_weights/5))
# print(sorted_nGram_dict(weight_dict)[:10])
# print(sorted_nGram_dict(weight_dict)[-10:])

Repeat for 3 sentence groups

In [17]:
# skf = StratifiedKFold(n_splits=5, shuffle = True)
# # skf.get_n_splits(X, Y)
# all_weights = np.zeros((len(initialized_nGram_dict_sw_e),))


# p1 = [ x+' '+y +' ' + z for x,y,z in zip(eastern_sentences[0::3], eastern_sentences[1::3], eastern_sentences[2::3]) ]
# p2 = [  x+' '+y +' ' + z for x,y,z in zip(southwestern_sentences[0::3], southwestern_sentences[1::3], southwestern_sentences[2::3]) ]
# X3 = p1 + p2

# Y3 = np.concatenate((np.zeros(len(p1)),np.ones(len(p2))))

# featurized_x =  np.zeros((len(initialized_nGram_dict_sw_e),len(X3)))
# # featurized_x = compute_n_grams(X,ngrams_count)
# for index,sentence in enumerate(featurized_x.T):
#     features = compute_n_grams(X3[index],ngrams_count)
#     new_dict = copy.deepcopy(initialized_nGram_dict_sw_e)
#     for something in features.keys():
#             if something in new_dict.keys():
#                 new_dict[something] = features[something]
#     featurized_x[:,index] = np.fromiter(new_dict.values(), dtype=float)
    
# single_sentence_inputs = []
# single_sentence_labels = []
# for i, (train_index, test_index) in enumerate(skf.split(X3, Y3)):
#     clf.fit(featurized_x[:,train_index].T, Y3[train_index])
    
#     single_sentence_inputs.extend(clf.predict(featurized_x[:,test_index].T))
#     single_sentence_labels.extend(Y3[test_index])
#     all_weights += np.asarray(clf[1].coef_[0])
    
# results = np.asarray(single_sentence_inputs).reshape(-1)
# truth_labels = np.asarray(single_sentence_labels).reshape(-1)

# print(classification_report(truth_labels,results))
# print(confusion_matrix(truth_labels,results))



# weight_dict = dict(zip(list(initialized_nGram_dict_sw_e.keys()),all_weights/5))
# print(sorted_nGram_dict(weight_dict)[:10])
# print(sorted_nGram_dict(weight_dict)[-10:])

Repeat for 4 sentence groups

In [18]:
# skf = StratifiedKFold(n_splits=5, shuffle = True)
# # skf.get_n_splits(X, Y)
# all_weights = np.zeros((len(initialized_nGram_dict_sw_e),))

# p1 = [ x+' '+y +' ' + z+' ' + a for x,y,z,a in zip(eastern_sentences[0::4], eastern_sentences[1::4], eastern_sentences[2::4], eastern_sentences[3::4]) ]
# p2 = [ x+' '+y +' ' + z+' ' + a for x,y,z,a in zip(southwestern_sentences[0::4], southwestern_sentences[1::4], southwestern_sentences[2::4], southwestern_sentences[3::4]) ]
# X4 = p1 + p2

# Y4 = np.concatenate((np.zeros(len(p1)),np.ones(len(p2))))

# featurized_x =  np.zeros((len(initialized_nGram_dict_sw_e),len(X4)))
# # featurized_x = compute_n_grams(X,ngrams_count)
# for index,sentence in enumerate(featurized_x.T):
#     features = compute_n_grams(X4[index],ngrams_count)
#     new_dict = copy.deepcopy(initialized_nGram_dict_sw_e)
#     for something in features.keys():
#             if something in new_dict.keys():
#                 new_dict[something] = features[something]
#     featurized_x[:,index] = np.fromiter(new_dict.values(), dtype=float)
    
# single_sentence_inputs = []
# single_sentence_labels = []
# for i, (train_index, test_index) in enumerate(skf.split(X4, Y4)):
#     clf.fit(featurized_x[:,train_index].T, Y4[train_index])
    
#     single_sentence_inputs.extend(clf.predict(featurized_x[:,test_index].T))
#     single_sentence_labels.extend(Y4[test_index])
#     all_weights += np.asarray(clf[1].coef_[0])
    
# results = np.asarray(single_sentence_inputs).reshape(-1)
# truth_labels = np.asarray(single_sentence_labels).reshape(-1)

# print(classification_report(truth_labels,results))
# print(confusion_matrix(truth_labels,results))



# weight_dict = dict(zip(list(initialized_nGram_dict_sw_e.keys()),all_weights/5))
# print(sorted_nGram_dict(weight_dict)[:10])
# print(sorted_nGram_dict(weight_dict)[-10:])

Repeat for 5 sentence groups

In [19]:
# skf = StratifiedKFold(n_splits=5, shuffle = True)
# # skf.get_n_splits(X, Y)
# all_weights = np.zeros((len(initialized_nGram_dict_sw_e),))

# p1 = [ x+' '+y +' ' + z+' ' + a+' ' + b  for x,y,z,a,b in zip(eastern_sentences[0::5], eastern_sentences[1::5], eastern_sentences[2::5], eastern_sentences[3::5], eastern_sentences[4::5]) ]
# p2 = [ x+' '+y +' ' + z+' ' + a+' ' + b for x,y,z,a,b in zip(southwestern_sentences[0::5], southwestern_sentences[1::5], southwestern_sentences[2::5], southwestern_sentences[3::5], southwestern_sentences[4::5]) ]
# X5 = p1 + p2

# Y5 = np.concatenate((np.zeros(len(p1)),np.ones(len(p2))))

# featurized_x =  np.zeros((len(initialized_nGram_dict_sw_e),len(X5)))
# # featurized_x = compute_n_grams(X,ngrams_count)
# for index,sentence in enumerate(featurized_x.T):
#     features = compute_n_grams(X5[index],ngrams_count)
#     new_dict = copy.deepcopy(initialized_nGram_dict_sw_e)
#     for something in features.keys():
#             if something in new_dict.keys():
#                 new_dict[something] = features[something]
#     featurized_x[:,index] = np.fromiter(new_dict.values(), dtype=float)
    
# single_sentence_inputs = []
# single_sentence_labels = []
# for i, (train_index, test_index) in enumerate(skf.split(X5, Y5)):
#     clf.fit(featurized_x[:,train_index].T, Y5[train_index])
    
#     single_sentence_inputs.extend(clf.predict(featurized_x[:,test_index].T))
#     single_sentence_labels.extend(Y5[test_index])
#     all_weights += np.asarray(clf[1].coef_[0])
    
# results = np.asarray(single_sentence_inputs).reshape(-1)
# truth_labels = np.asarray(single_sentence_labels).reshape(-1)

# print(classification_report(truth_labels,results))
# print(confusion_matrix(truth_labels,results))



# weight_dict = dict(zip(list(initialized_nGram_dict_sw_e.keys()),all_weights/5))
# print(sorted_nGram_dict(weight_dict)[:10])
# print(sorted_nGram_dict(weight_dict)[-10:])

Original word test based on 5 sentence model

In [20]:
# df = pd.read_csv('./data/fiftyOjibweWordsE_SW.csv')

In [21]:
# words = df['Ojibwe SW'].to_list() + df['Ojibwe E/Odawa'].to_list()
# labels = list(np.ones((50))) + list(np.zeros((50)))

In [22]:
# featurized_words =  np.zeros((len(initialized_nGram_dict_sw_e),len(words)))
# for index,sentence in enumerate(featurized_words.T):
#     features = compute_n_grams(words[index],ngrams_count)
#     new_dict = copy.deepcopy(initialized_nGram_dict_sw_e)
#     for something in features.keys():
#             if something in new_dict.keys():
#                 new_dict[something] = features[something]
#     featurized_words[:,index] = np.fromiter(new_dict.values(), dtype=float)
    

    
# word_results = clf.predict(featurized_words.T)
# print(classification_report(labels,word_results))
# print(confusion_matrix(labels,word_results))

In [23]:
#sloppy export of results for copy/paste into results spreadsheet
# for i in np.arange(100):
# #     print(words[i],word_results[i],words[i+50],word_results[i+50])
#     if word_results[i]==1:
#         print('SW')
#     else:
#         print('E')
        

Build truncated n_gram dictionary

In [24]:
# top100_e_ngrams = dict(sorted_nGram_dict(all_e_ngrams)[:100])

In [25]:
# top100_sw_ngrams = dict(sorted_nGram_dict(all_sw_ngrams)[:100])

In [26]:
def dict_key_overlap(dict1,dict2):
    """Compare the key sets of two dictionaries.

    Returns a tuple (overlap_count, overlap_keys, all_keys):
      overlap_count - number of dict2 keys that are also keys of dict1
      overlap_keys  - those shared keys, in dict2's order
      all_keys      - the de-duplicated union of both key sets, as a list
    """
    overlap_keys = [ngram for ngram in dict2.keys() if ngram in dict1.keys()]
    # dict2's keys go in first, then dict1's, before de-duplicating
    all_keys = list(set([*dict2.keys(), *dict1.keys()]))
    return len(overlap_keys), overlap_keys, all_keys

Compute overlap size for truncated dictionaries

In [27]:
# for size in [100,500,1000]:
#     overlap, shared_keys, all_keys = dict_key_overlap(dict(sorted_nGram_dict(all_e_ngrams)[:size]),dict(sorted_nGram_dict(all_sw_ngrams)[:size]))
#     print(size)
#     print(overlap)
#     print(len(all_keys))
    
    

Initialize small truncated dictionary

In [28]:
# overlap, shared_keys, all_keys = dict_key_overlap(top100_e_ngrams,top100_sw_ngrams)

# initialized_nGram_dict_sw_e_100 = {key: 0 for key in all_keys}

In [29]:
# len(initialized_nGram_dict_sw_e_100)

Single sentence test

In [30]:
# skf = StratifiedKFold(n_splits=5, shuffle = True)


# clf2 = make_pipeline(StandardScaler(),
#                     SGDClassifier(max_iter=1000, tol=1e-3, loss='hinge')) #loss huber


    
# featurized_x =  np.zeros((len(initialized_nGram_dict_sw_e_100),len(X)))
# # featurized_x = compute_n_grams(X,ngrams_count)
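# # NOTE(review): this enumerates the rows of featurized_x (one per n-gram), not its columns (one per sample);
# # the full-dictionary cells above use enumerate(featurized_x.T). As written, only the first
# # len(initialized_nGram_dict_sw_e_100) samples are featurized - use featurized_x.T when replicating.
# for index,sentence in enumerate(featurized_x):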
#     features = compute_n_grams(X[index],ngrams_count)
#     new_dict = copy.deepcopy(initialized_nGram_dict_sw_e_100)
#     for something in features.keys():
#             if something in new_dict.keys():
#                 new_dict[something] = features[something]
#     featurized_x[:,index] = np.fromiter(new_dict.values(), dtype=float)
    
# single_sentence_inputs = []
# single_sentence_labels = []
# for i, (train_index, test_index) in enumerate(skf.split(X, Y)):
#     clf2.fit(featurized_x[:,train_index].T, Y[train_index])
    
#     single_sentence_inputs.extend(clf2.predict(featurized_x[:,test_index].T))
#     single_sentence_labels.extend(Y[test_index])
    
    
# results = np.asarray(single_sentence_inputs).reshape(-1)
# truth_labels = np.asarray(single_sentence_labels).reshape(-1)

# print(classification_report(truth_labels,results))
# print(confusion_matrix(truth_labels,results))


Two sentence test

In [31]:
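# # NOTE(review): the SW pairs are joined with x+y (no ' ' separator) while the E pairs use x+' '+y,
# # and Y2 is sized from [0::2], which is one longer than the zipped pairs when a dialect has an odd
# # number of sentences. The 3/4/5-sentence cells avoid both (they join with ' ' and size labels from p1/p2).
# X2 = [ x+' '+y for x,y in zip(eastern_sentences[0::2], eastern_sentences[1::2]) ] + [ x+y for x,y in zip(southwestern_sentences[0::2], southwestern_sentences[1::2]) ]
# Y2 = np.concatenate((np.zeros(len(eastern_sentences[0::2])),np.ones(len(southwestern_sentences[0::2]))))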

In [32]:
# skf = StratifiedKFold(n_splits=5, shuffle = True)
# skf.get_n_splits(X2, Y2)

# clf2 = make_pipeline(StandardScaler(),
#                     SGDClassifier(max_iter=1000, tol=1e-3, loss='hinge')) #loss huber


    
# featurized_x =  np.zeros((len(initialized_nGram_dict_sw_e_100),len(X2)))
# # featurized_x = compute_n_grams(X,ngrams_count)
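# # NOTE(review): this enumerates the rows of featurized_x (one per n-gram), not its columns (one per sample);
# # the full-dictionary cells above use enumerate(featurized_x.T). As written, only the first
# # len(initialized_nGram_dict_sw_e_100) samples are featurized - use featurized_x.T when replicating.
# for index,sentence in enumerate(featurized_x):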
#     features = compute_n_grams(X2[index],ngrams_count)
#     new_dict = copy.deepcopy(initialized_nGram_dict_sw_e_100)
#     for something in features.keys():
#             if something in new_dict.keys():
#                 new_dict[something] = features[something]
#     featurized_x[:,index] = np.fromiter(new_dict.values(), dtype=float)
    
# single_sentence_inputs = []
# single_sentence_labels = []
# for i, (train_index, test_index) in enumerate(skf.split(X2, Y2)):
#     clf2.fit(featurized_x[:,train_index].T, Y2[train_index])
    
#     single_sentence_inputs.extend(clf2.predict(featurized_x[:,test_index].T))
#     single_sentence_labels.extend(Y2[test_index])
    
    
# results = np.asarray(single_sentence_inputs).reshape(-1)
# truth_labels = np.asarray(single_sentence_labels).reshape(-1)

# print(classification_report(truth_labels,results))
# print(confusion_matrix(truth_labels,results))

Three sentence test

In [33]:
# p1 = [ x+' '+y +' ' + z for x,y,z in zip(eastern_sentences[0::3], eastern_sentences[1::3], eastern_sentences[2::3]) ]
# p2 = [  x+' '+y +' ' + z for x,y,z in zip(southwestern_sentences[0::3], southwestern_sentences[1::3], southwestern_sentences[2::3]) ]
# X3 = p1 + p2

# Y3 = np.concatenate((np.zeros(len(p1)),np.ones(len(p2))))

In [34]:
# skf = StratifiedKFold(n_splits=5, shuffle = True)
# skf.get_n_splits(X3, Y3)

# clf2 = make_pipeline(StandardScaler(),
#                     SGDClassifier(max_iter=1000, tol=1e-3, loss='hinge')) #loss huber


    
# featurized_x =  np.zeros((len(initialized_nGram_dict_sw_e_100),len(X3)))
# # featurized_x = compute_n_grams(X,ngrams_count)
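# # NOTE(review): this enumerates the rows of featurized_x (one per n-gram), not its columns (one per sample);
# # the full-dictionary cells above use enumerate(featurized_x.T). As written, only the first
# # len(initialized_nGram_dict_sw_e_100) samples are featurized - use featurized_x.T when replicating.
# for index,sentence in enumerate(featurized_x):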
#     features = compute_n_grams(X3[index],ngrams_count)
#     new_dict = copy.deepcopy(initialized_nGram_dict_sw_e_100)
#     for something in features.keys():
#             if something in new_dict.keys():
#                 new_dict[something] = features[something]
#     featurized_x[:,index] = np.fromiter(new_dict.values(), dtype=float)
    
# single_sentence_inputs = []
# single_sentence_labels = []
# for i, (train_index, test_index) in enumerate(skf.split(X3, Y3)):
#     clf2.fit(featurized_x[:,train_index].T, Y3[train_index])
    
#     single_sentence_inputs.extend(clf2.predict(featurized_x[:,test_index].T))
#     single_sentence_labels.extend(Y3[test_index])
    
    
# results = np.asarray(single_sentence_inputs).reshape(-1)
# truth_labels = np.asarray(single_sentence_labels).reshape(-1)

# print(classification_report(truth_labels,results))
# print(confusion_matrix(truth_labels,results))

Four sentence test

In [35]:
# p1 = [ x+' '+y +' ' + z+' ' + a for x,y,z,a in zip(eastern_sentences[0::4], eastern_sentences[1::4], eastern_sentences[2::4], eastern_sentences[3::4]) ]
# p2 = [ x+' '+y +' ' + z+' ' + a for x,y,z,a in zip(southwestern_sentences[0::4], southwestern_sentences[1::4], southwestern_sentences[2::4], southwestern_sentences[3::4]) ]
# X4 = p1 + p2

# Y4 = np.concatenate((np.zeros(len(p1)),np.ones(len(p2))))

In [36]:
# skf = StratifiedKFold(n_splits=5, shuffle = True)
# skf.get_n_splits(X4, Y4)

# clf2 = make_pipeline(StandardScaler(),
#                     SGDClassifier(max_iter=1000, tol=1e-3, loss='hinge')) #loss huber


    
# featurized_x =  np.zeros((len(initialized_nGram_dict_sw_e_100),len(X4)))
# # featurized_x = compute_n_grams(X,ngrams_count)
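# # NOTE(review): this enumerates the rows of featurized_x (one per n-gram), not its columns (one per sample);
# # the full-dictionary cells above use enumerate(featurized_x.T). As written, only the first
# # len(initialized_nGram_dict_sw_e_100) samples are featurized - use featurized_x.T when replicating.
# for index,sentence in enumerate(featurized_x):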
#     features = compute_n_grams(X4[index],ngrams_count)
#     new_dict = copy.deepcopy(initialized_nGram_dict_sw_e_100)
#     for something in features.keys():
#             if something in new_dict.keys():
#                 new_dict[something] = features[something]
#     featurized_x[:,index] = np.fromiter(new_dict.values(), dtype=float)
    
# single_sentence_inputs = []
# single_sentence_labels = []
# for i, (train_index, test_index) in enumerate(skf.split(X4, Y4)):
#     clf2.fit(featurized_x[:,train_index].T, Y4[train_index])
    
#     single_sentence_inputs.extend(clf2.predict(featurized_x[:,test_index].T))
#     single_sentence_labels.extend(Y4[test_index])
    
    
# results = np.asarray(single_sentence_inputs).reshape(-1)
# truth_labels = np.asarray(single_sentence_labels).reshape(-1)

# print(classification_report(truth_labels,results))
# print(confusion_matrix(truth_labels,results))

Five sentence test with truncated feature set

In [37]:
# p1 = [ x+' '+y +' ' + z+' ' + a+' ' + b  for x,y,z,a,b in zip(eastern_sentences[0::5], eastern_sentences[1::5], eastern_sentences[2::5], eastern_sentences[3::5], eastern_sentences[4::5]) ]
# p2 = [ x+' '+y +' ' + z+' ' + a+' ' + b for x,y,z,a,b in zip(southwestern_sentences[0::5], southwestern_sentences[1::5], southwestern_sentences[2::5], southwestern_sentences[3::5], southwestern_sentences[4::5]) ]
# X5 = p1 + p2

# Y5 = np.concatenate((np.zeros(len(p1)),np.ones(len(p2))))

In [38]:
# skf = StratifiedKFold(n_splits=5, shuffle = True)
# skf.get_n_splits(X5, Y5)
# all_weights = np.zeros((118,))
# clf2 = make_pipeline(StandardScaler(),
#                     SGDClassifier(max_iter=1000, tol=1e-3, loss='hinge')) #loss huber


    
# featurized_x =  np.zeros((len(initialized_nGram_dict_sw_e_100),len(X5)))
# # featurized_x = compute_n_grams(X,ngrams_count)
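# # NOTE(review): this enumerates the rows of featurized_x (one per n-gram), not its columns (one per sample);
# # the full-dictionary cells above use enumerate(featurized_x.T). As written, only the first
# # len(initialized_nGram_dict_sw_e_100) samples are featurized - use featurized_x.T when replicating.
# for index,sentence in enumerate(featurized_x):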
#     features = compute_n_grams(X5[index],ngrams_count)
#     new_dict = copy.deepcopy(initialized_nGram_dict_sw_e_100)
#     for something in features.keys():
#             if something in new_dict.keys():
#                 new_dict[something] = features[something]
#     featurized_x[:,index] = np.fromiter(new_dict.values(), dtype=float)
    
# single_sentence_inputs = []
# single_sentence_labels = []
# for i, (train_index, test_index) in enumerate(skf.split(X5, Y5)):
#     clf2.fit(featurized_x[:,train_index].T, Y5[train_index])
    
#     single_sentence_inputs.extend(clf2.predict(featurized_x[:,test_index].T))
#     single_sentence_labels.extend(Y5[test_index])
#     all_weights += np.asarray(clf2[1].coef_[0])
    
# results = np.asarray(single_sentence_inputs).reshape(-1)
# truth_labels = np.asarray(single_sentence_labels).reshape(-1)

# print(classification_report(truth_labels,results,target_names=['e','sw']))
# print(confusion_matrix(truth_labels,results))


In [39]:
# # dictionaries can be searched
# all_sw_ngrams['wi']

In [40]:
# all_e_ngrams['wi']