In [1]:
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.parse.corenlp import CoreNLPParser
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
import pandas as pd

In [2]:
# Path to the training corpus, relative to the notebook's working directory
dataset = "data/DSL-TRAIN.txt"

In [3]:
# Gather every distinct character that appears anywhere in the dataset.
# Lines are consumed exactly as read, so separators such as '\t' and '\n'
# end up in the set as well.
charset = set()
with open(dataset, encoding='utf-8') as f:
    for line in f:
        # `line` stays bound after the loop; a later cell inspects it.
        charset |= set(line)

In [4]:
# Display the collected character set
charset

{'\t',
 '\n',
 ' ',
 '!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '^',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '{',
 '|',
 '}',
 '~',
 '\x7f',
 '\xa0',
 '¡',
 '¢',
 '£',
 '¤',
 '¥',
 '¦',
 '§',
 '¨',
 'ª',
 '«',
 '¬',
 '®',
 '¯',
 '°',
 '²',
 '³',
 '´',
 '¶',
 '·',
 'º',
 '»',
 '¼',
 '½',
 '¾',
 '¿',
 'À',
 'Á',
 'Â',
 'Ã',
 'Å',
 'Ç',
 'È',
 'É',
 'Ê',
 'Ë',
 'Í',
 'Î',
 'Ð',
 'Ñ',
 'Ó',
 'Ô',
 'Õ',
 'Ö',
 '×',
 'Ø',
 'Ù',
 'Ú',
 'Ü',
 'ß',
 'à',
 'á',
 'â',
 'ã',
 'ä',
 'å',
 'æ',
 'ç',
 'è',
 'é',
 'ê',
 'ë',
 'ì',
 'í',
 'î',
 'ï',
 'ñ',
 'ò',
 '

In [5]:
# Character frequencies for a single line, most frequent first.
# `line` holds whatever the most recently executed loop over the file left in it.
from collections import Counter, OrderedDict
Counter(line).most_common()

[(' ', 40),
 ('e', 26),
 ('o', 25),
 ('a', 21),
 ('i', 18),
 ('n', 18),
 ('r', 14),
 ('s', 13),
 ('m', 12),
 ('u', 12),
 ('t', 12),
 ('j', 11),
 ('d', 11),
 ('v', 10),
 ('p', 9),
 ('l', 7),
 ('b', 5),
 ('z', 5),
 ('š', 4),
 (',', 4),
 ('k', 3),
 ('ž', 2),
 ('E', 1),
 ('f', 1),
 ('g', 1),
 ('ć', 1),
 ('“', 1),
 ('Š', 1),
 ('D', 1),
 ('h', 1),
 ('č', 1),
 ('.', 1),
 ('\t', 1),
 ('„', 1),
 ('P', 1),
 ('c', 1),
 ('S', 1)]

In [7]:
# Template vector: one zero-valued slot per character, in sorted character order
zero_vector = OrderedDict.fromkeys(sorted(charset), 0)

In [8]:
# Inspect the template: every character of the set maps to 0
zero_vector

OrderedDict([('\t', 0),
             ('\n', 0),
             (' ', 0),
             ('!', 0),
             ('"', 0),
             ('#', 0),
             ('$', 0),
             ('%', 0),
             ('&', 0),
             ("'", 0),
             ('(', 0),
             (')', 0),
             ('*', 0),
             ('+', 0),
             (',', 0),
             ('-', 0),
             ('.', 0),
             ('/', 0),
             ('0', 0),
             ('1', 0),
             ('2', 0),
             ('3', 0),
             ('4', 0),
             ('5', 0),
             ('6', 0),
             ('7', 0),
             ('8', 0),
             ('9', 0),
             (':', 0),
             (';', 0),
             ('<', 0),
             ('=', 0),
             ('>', 0),
             ('?', 0),
             ('@', 0),
             ('A', 0),
             ('B', 0),
             ('C', 0),
             ('D', 0),
             ('E', 0),
             ('F', 0),
             ('G', 0),
             ('H', 0),
         

In [9]:
# Trial run: build character-count vectors for the first 10 lines of the dataset.
# Each count is divided by the size of the character set (not by the line length).
import copy
line_vectors = []
with open(dataset, encoding='utf-8') as f:
    for i in range(10):
        line = f.readline()
        # Start from a shallow copy of the all-zero template, then overwrite the
        # slots of the characters that occur in this line.
        line_vec = copy.copy(zero_vector)
        line_char_count = Counter(line)
        line_vec.update(
            (char, count/len(charset)) for char, count in line_char_count.items()
        )
        line_vectors.append(line_vec)
print(line_vectors[0])

OrderedDict([('\t', 0.002717391304347826), ('\n', 0.002717391304347826), (' ', 0.05434782608695652), ('!', 0), ('"', 0), ('#', 0), ('$', 0), ('%', 0), ('&', 0), ("'", 0), ('(', 0), (')', 0), ('*', 0), ('+', 0), (',', 0.005434782608695652), ('-', 0.002717391304347826), ('.', 0.002717391304347826), ('/', 0), ('0', 0.002717391304347826), ('1', 0), ('2', 0), ('3', 0), ('4', 0.002717391304347826), ('5', 0.005434782608695652), ('6', 0), ('7', 0.002717391304347826), ('8', 0), ('9', 0), (':', 0), (';', 0), ('<', 0), ('=', 0), ('>', 0), ('?', 0), ('@', 0), ('A', 0), ('B', 0), ('C', 0), ('D', 0), ('E', 0), ('F', 0), ('G', 0), ('H', 0), ('I', 0.002717391304347826), ('J', 0), ('K', 0), ('L', 0), ('M', 0), ('N', 0), ('O', 0), ('P', 0), ('Q', 0), ('R', 0), ('S', 0), ('T', 0), ('U', 0), ('V', 0), ('W', 0), ('X', 0), ('Y', 0), ('Z', 0), ('[', 0), (']', 0), ('^', 0), ('_', 0), ('`', 0), ('a', 0.021739130434782608), ('b', 0.008152173913043478), ('c', 0.002717391304347826), ('d', 0.01358695652173913), ('

In [10]:
# The entries of a vector do not sum to one: every count was divided by the size
# of the whole character set rather than by the length of its own line.
# TODO: Check alternative sources to see if this is the standard
[sum(thisvec.values()) for thisvec in line_vectors]

[0.3614130434782609,
 0.3260869565217391,
 0.6032608695652172,
 0.1929347826086957,
 0.5217391304347826,
 0.25271739130434784,
 0.47282608695652184,
 0.48641304347826086,
 0.2608695652173913,
 0.9456521739130436]

In [12]:
# First-pass loop based cosine similarity calculator, to test accuracy of vectorized version
import math
def cosine_sim(vec1, vec2):
    """Return the cosine similarity of two vectors using plain Python loops.

    Args:
        vec1: Mapping whose values are the vector components (for example an
            OrderedDict of character -> weight). Only the values are used, in
            iteration order.
        vec2: Mapping with the same number of entries as ``vec1``; its values
            are paired with those of ``vec1`` by position.

    Returns:
        dot(vec1, vec2) / (|vec1| * |vec2|), or 0.0 when the denominator is
        zero (the similarity is undefined for an all-zero vector, such as the
        one produced by an empty line).

    Raises:
        ValueError: If the two mappings have a different number of entries.
    """
    values_1 = list(vec1.values())
    values_2 = list(vec2.values())
    # Positional pairing is only meaningful for equally long vectors; without
    # this check a longer vec2 would be silently truncated in the dot product
    # while still contributing fully to its own magnitude.
    if len(values_1) != len(values_2):
        raise ValueError(
            "vectors must have the same length, got %d and %d"
            % (len(values_1), len(values_2))
        )
    dot_prod = 0
    for x, y in zip(values_1, values_2):
        dot_prod += x * y
    mag_1 = math.sqrt(sum([x**2 for x in values_1]))
    mag_2 = math.sqrt(sum([y**2 for y in values_2]))
    denominator = mag_1 * mag_2
    if denominator == 0:
        # Avoid ZeroDivisionError for all-zero vectors.
        return 0.0
    return dot_prod / denominator

In [11]:
# Vectorized cosine similarity calculator
import numpy as np
def np_cosine_sim(vec1, vec2):
    """Return the cosine similarity of two vectors using NumPy.

    Args:
        vec1: Mapping whose values are the vector components; only the values
            are used, in iteration order.
        vec2: Mapping with the same number of entries as ``vec1``; its values
            are paired with those of ``vec1`` by position.

    Returns:
        dot(vec1, vec2) / (|vec1| * |vec2|), or 0.0 when the denominator is
        zero (instead of ``nan`` plus a RuntimeWarning).

    Raises:
        ValueError: If the two mappings have a different number of entries.
    """
    a = np.asarray(list(vec1.values()))
    b = np.asarray(list(vec2.values()))
    if a.shape != b.shape:
        raise ValueError(
            "vectors must have the same shape, got %s and %s" % (a.shape, b.shape)
        )
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Cosine similarity is undefined for an all-zero vector.
        return 0.0
    return a.dot(b) / denominator

In [13]:
# Cosine similarity between each pair of consecutive lines among the first ten
for current_vec, next_vec in zip(line_vectors, line_vectors[1:]):
    print(cosine_sim(current_vec, next_vec))

0.9167338222301734
0.9341823733148841
0.88379343247608
0.9225860611384142
0.9045655394449315
0.9325050340174421
0.8759725922698244
0.8557169633109853
0.9043723539174185


In [14]:
# Now compare to a Spanish line - first extract the line.
# The file is read as UTF-16, tab-separated; the sentence comes from the 'Text' column.
spanish = "data/dataset-es-ES.csv"
df_spanish = pd.read_csv(spanish, encoding="utf-16", sep="\t")
# Single-step label lookup (row 0, column 'Text') instead of chained indexing.
spanish_line = df_spanish.loc[0, 'Text']
print(spanish_line)

Agregó que tambien informó a Valcárcel de la reunión mantenida la pasada semana en el ayuntamiento de Cartagena sobre este mismo asunto, en la que se estableció "un sistema y un método" para toda la tramitación tanto de los terrenos del polígono industrial de Los Camachos como los de Torreciega.


In [15]:
# Create character count vector for this line.
# Counts are divided by the size of the character set, as for the dataset vectors.
spanish_line_vec = copy.copy(zero_vector)
line_char_count = Counter(spanish_line)
for char, count in line_char_count.items():
    # Only fill slots that exist in the template: a character that never occurs in
    # the dataset would otherwise append a new key and make this vector longer than
    # the others, which the positional cosine-similarity functions cannot handle.
    if char in spanish_line_vec:
        spanish_line_vec[char] = count/len(charset)

In [16]:
# Now the same for Farsi: load the tab-separated UTF-16 file and pull out the
# 'Text' value of its first row
farsi = "data/dataset-fa-IR.csv"
df_farsi = pd.read_csv(farsi, sep="\t", encoding="utf-16")
farsi_line = df_farsi.loc[0, 'Text']
print(farsi_line)

نیروهای نظامی سومالی و اتحادیه آفریقا با بیرون راندن  اسلامگرایان الشباب، کنترل شماری از شهرهای جنوبی و مرزی سومالی را در دست گرفتند.


In [17]:
# Create character count vector for this line.
# Counts are divided by the size of the character set, as for the dataset vectors.
farsi_line_vec = copy.copy(zero_vector)
line_char_count = Counter(farsi_line)
for char, count in line_char_count.items():
    # Only fill slots that exist in the template: a character that never occurs in
    # the dataset would otherwise append a new key and make this vector longer than
    # the others, which the positional cosine-similarity functions cannot handle.
    if char in farsi_line_vec:
        farsi_line_vec[char] = count/len(charset)

In [18]:
# Similarity of a Bosnian line and a Farsi line
# (line_vectors[0] is the vector of the first line read from the dataset)
np_cosine_sim(line_vectors[0], farsi_line_vec)

0.34868568127063615

In [19]:
# Similarity of a Bosnian line and Spanish line. Unfortunately very high.
# The original guess was that the many zero-valued dimensions cause this.
# NOTE(review): dimensions that are zero in both vectors add nothing to the dot
# product or to either norm, so they cannot raise the score; overlap in frequent
# shared characters (spaces, common Latin letters) is a more likely cause - confirm.
# This cell uses line_vectors[9] (the tenth line), not line_vectors[0].
np_cosine_sim(line_vectors[9], spanish_line_vec)

0.9172522418029946

In [20]:
# Similarity of a Spanish line and a Farsi line (vectorized implementation)
np_cosine_sim(spanish_line_vec, farsi_line_vec)

0.3442868017896875

In [21]:
# Sanity check - similarity of one line to itself. Should be ~1.0
# (floating-point rounding can leave the result a hair below exactly 1.0)
np_cosine_sim(spanish_line_vec, spanish_line_vec)

0.9999999999999999