In [None]:
!pip install unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting unidecode
  Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 6.6 MB/s 
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.4


In [None]:
import pandas as pd
import numpy as np
import os
import glob
import re
from unidecode import unidecode
import nltk
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
import matplotlib as mpl
from sklearn.feature_extraction.text import CountVectorizer as CV

# Mount Google Drive at /content/drive so that the poem files and the figures
# directory configured below (both under /content/drive/MyDrive) are reachable.
# Requires the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Project root on the mounted drive; poems are read from <root>/poems and
# figures are written to <root>/figures.
incerto_dir = '/content/drive/MyDrive/incerto-autore/'
poems_dir = os.path.join(incerto_dir, 'poems')
figures_dir = os.path.join(incerto_dir, 'figures')

# glob returns paths in arbitrary, filesystem-dependent order; sorting makes the
# row order of everything built from this list reproducible across runs.
poems_files = sorted(glob.glob(os.path.join(poems_dir, "*.txt")))

In [None]:
# Column-oriented accumulator: one entry per poem file in each list.
poems_d = {'poem_label':[], 'author': [], 'poem':[]}

for poem_path in poems_files:
  # The author is whatever precedes the first underscore in the file name.
  author_name = os.path.basename(poem_path).split('_')[0]
  poems_d['author'].append(author_name)

  poem_l = []
  # 'utf-8-sig' transparently drops a leading byte-order mark if the file has one.
  # The handle gets its own name so it no longer shadows the path loop variable.
  with open(poem_path, 'r', encoding='utf-8-sig') as poem_file:
    for line in poem_file:
      line = line.strip().lower()
      # Replace every character that is neither a word character nor whitespace.
      line = re.sub(r'[^\w\s]', ' ', line)
      # Transliterate to ASCII. unidecode's second positional argument is the
      # `errors` policy, not an encoding, so it is left at its default here.
      line = unidecode(line)
      poem_l.append(line)

  # The whole poem becomes one space-joined string.
  poem = ' '.join(poem_l)
  poems_d['poem'].append(poem)

  # Short label: the first five characters of the cleaned poem text.
  poem_label = poem[:5]
  poems_d['poem_label'].append(poem_label)

In [None]:
# One row per poem file, with columns poem_label / author / poem.
poems_df = pd.DataFrame(data=poems_d)

# Peek at the last ten rows.
poems_df.tail(10)

In [None]:
# def cleaning_docs(df, df_ids, df_docs):
#     for index, row in df.iterrows():
#         doc_id = row[df_ids]
#         doc = row[df_docs]
#         clean_doc = []
#         lowercase = doc.lower()
#         tokenized = nltk.word_tokenize(lowercase)  # list of tokens
#         for token in tokenized:
#             token = re.sub(r'[^\w\s\d]', '', token)  # remove punctuation from token
#             #if token and token not in stopwords:  # if token is not empty and is not in stopwords
#             token = lemmatizer.lemmatize(token)  # lemmatize token
#             clean_doc.append(token)
#             vocab.add(token)
#         n_tokxdoc.append(len(clean_doc))
#         clean_doc = ' '.join(clean_doc)
#         if clean_doc:
#             doc_ids.append(doc_id)
#             clean_docs.append(clean_doc)

#     print("Number of Documents: {}".format(len(clean_docs)))
#     print("Mean Number of Words per Document: {}".format(np.mean(n_tokxdoc)))
#     print("Vocabulary Size: {}".format(len(list(vocab))))

#     return doc_ids, clean_docs

In [None]:
# Monroe's fightin' words calculation
# Based on Jack Hessel's and Xanda Schofield's fightin words implementations

def basic_sanitize(in_string):
    '''Return a roughly sanitized copy of the input string.

    Non-ASCII characters are dropped, the text is lowercased, and every run of
    whitespace (including leading/trailing) is collapsed to a single space.
    '''
    ascii_only = in_string.encode('ascii', errors='ignore').decode('ascii')
    tokens = ascii_only.lower().split()
    return ' '.join(tokens)

In [None]:
def _log_odds_z_scores(count_matrix, priors):
    '''Return, per vocabulary column, the z-scored log-odds difference of row 0 minus row 1.'''
    smoothed = count_matrix + priors                                   # y_w + alpha_w
    totals = count_matrix.sum(axis=1, keepdims=True) + priors.sum()    # n + alpha_0
    log_odds = np.log(smoothed / (totals - smoothed))
    delta = log_odds[0] - log_odds[1]
    variance = (1. / smoothed).sum(axis=0)
    return delta / np.sqrt(variance)


def bayes_compare_language(lang1, lang2, output_path, ngram=1, prior=.01, cv=None, sig_val=2.573):
    '''
    Compare the vocabulary of two corpora with prior-smoothed log-odds z-scores,
    save a frequency/z-score scatter plot and print the most distinctive terms.

    Arguments:
    - lang1, lang2; keys into the module-level `lang_d` (list of document strings
    per corpus) and `colors_d` (plot colour per corpus).
    - output_path; directory the figure "<lang1> vs <lang2>.png" is written to.
    It is created if it does not exist.
    - ngram; an int describing up to what n gram you want to consider (1 is unigrams,
    2 is bigrams + unigrams, etc). Ignored if a custom CountVectorizer is passed.
    - prior; either a number describing a uniform prior, or a vector describing a prior
    over vocabulary items. If you're using a predefined vocabulary, make sure to specify that
    when you make your CountVectorizer object.
    - cv; a sklearn.feature_extraction.text.CountVectorizer object, if desired.
    - sig_val; absolute z-score above which a term is coloured and annotated.
    The default 2.573 is presumably meant as the ~1% two-sided normal critical
    value -- confirm.
    Returns:
    - A tuple (scores, vocabulary). `scores` is a list of length |Vocab| of
    (n-gram, zscore) tuples sorted by ascending z-score; `vocabulary` is the
    fitted vectorizer's term -> column index mapping.
    Sign convention: the score is the log-odds of lang1 minus that of lang2, so
    positive z-scores mark terms over-represented in lang1, negative ones in lang2.
    Raises:
    - KeyError if lang1/lang2 is missing from `lang_d` or `colors_d`.
    - ValueError if a non-uniform prior is given without a vectorizer, or if the
    prior vector does not have one entry per vocabulary item.
    '''

    l1 = [basic_sanitize(doc) for doc in lang_d[lang1]]
    l2 = [basic_sanitize(doc) for doc in lang_d[lang2]]

    # Any scalar (int, float, numpy scalar) counts as a uniform prior.
    uniform_prior = np.ndim(prior) == 0
    if cv is None and not uniform_prior:
        # Raise instead of quit(): quit() would take down the whole kernel.
        raise ValueError("If using a non-uniform prior: "
                         "Please also pass a count vectorizer with the vocabulary parameter set.")

    if cv is None:
        cv = CV(
            decode_error = 'ignore',
            min_df = .1,
            max_df = .7,
            ngram_range=(1,ngram),
            binary = False,
            max_features = 1500)
    counts_mat = cv.fit_transform(l1+l2).toarray()
    vocab_size = len(cv.vocabulary_)
    print("Vocab size is {}".format(vocab_size))

    if uniform_prior:
        priors = np.full(vocab_size, float(prior))
    else:
        priors = np.asarray(prior, dtype=float)
        if priors.shape != (vocab_size,):
            raise ValueError("prior has shape {} but the vocabulary has {} terms".format(
                priors.shape, vocab_size))

    # Row 0: per-term totals over lang1's documents; row 1: the same for lang2.
    count_matrix = np.vstack([counts_mat[:len(l1), :].sum(axis=0),
                              counts_mat[len(l1):, :].sum(axis=0)]).astype(float)

    print("Comparing language...")
    z_scores = _log_odds_z_scores(count_matrix, priors)

    index_to_term = {v: k for k, v in cv.vocabulary_.items()}
    sorted_indices = np.argsort(z_scores)
    return_list = [(index_to_term[i], z_scores[i]) for i in sorted_indices]

    # plotting z scores and frequencies
    x_vals = count_matrix.sum(axis=0)
    lang1_color, lang2_color, insig_color = (colors_d[lang1], colors_d[lang2], '#d8d8d8')
    colors = []
    for z in z_scores:
        if z > sig_val:
            colors.append(lang1_color)     # positive z: over-represented in lang1
        elif z < -sig_val:
            colors.append(lang2_color)     # negative z: over-represented in lang2
        else:
            colors.append(insig_color)

    fig, ax = plt.subplots(figsize=(15,10))
    ax.scatter(x_vals, z_scores, c=colors, linewidth=0, alpha = 0.7)

    # Label exactly the points that were coloured as significant.
    for i, z in enumerate(z_scores):
        if abs(z) > sig_val:
            ax.annotate(index_to_term[i], (x_vals[i], z), color='black', alpha = 1, fontsize=12)

    # Only the frequency axis is logarithmic; z-scores stay on a linear axis.
    ax.set_xscale('log')
    ax.set_xlabel("Word Frequency (log scale)")
    ax.set_ylabel("z-score")
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.set_title(f'{lang1} vs {lang2}')

    legend_elements = [Line2D([0], [0], marker='o', color=lang1_color, label=lang1, markersize=8, alpha=0.8, linestyle="None"),
                       Line2D([0], [0], marker='o', color=lang2_color, label=lang2, markersize=8, alpha=0.8, linestyle="None")]
    ax.legend(handles=legend_elements)

    os.makedirs(output_path, exist_ok=True)
    fig.savefig(os.path.join(output_path, f'{lang1} vs {lang2}.png'), dpi = 300)

    # return_list is ascending, so lang1's terms sit at the end and lang2's at the start.
    print(f'Most distinctive words for {lang1}')
    for term_score in return_list[-10:][::-1]:
        print(term_score)
    print(f'Most distinctive words for {lang2}')
    for term_score in return_list[:10]:
        print(term_score)

    return return_list, cv.vocabulary_

In [None]:
# Plot colour for each corpus label that can be passed to bayes_compare_language.
colors_d = {'Franco': 'tomato',
            'unknown': 'plum',
            'other': '#053430',
            'Franco + unknown': 'palevioletred'}

is_franco = poems_df['author'] == 'Franco'
is_unknown = poems_df['author'] == 'unknown'
# "other" is every poem whose author is neither 'Franco' nor 'unknown'.
cond = ~(is_franco | is_unknown)

# Documents per corpus label. Every label in colors_d has a matching entry
# here, including the pooled 'Franco + unknown' corpus.
lang_d = {'Franco': poems_df.loc[is_franco, 'poem'].tolist(),
          'unknown': poems_df.loc[is_unknown, 'poem'].tolist(),
          'other' : poems_df.loc[cond, 'poem'].tolist(),
          'Franco + unknown': poems_df.loc[is_franco | is_unknown, 'poem'].tolist()
          }

In [None]:
# Franco vs. all other attributed authors; document-frequency bounds 0.1-0.7.
vectorizer = CV(
    decode_error='ignore',
    min_df=0.1,
    max_df=0.7,
    binary=False,
)

output_list, vocabulary = bayes_compare_language(
    lang1='Franco',
    lang2='other',
    output_path=figures_dir,
    cv=vectorizer,
)


In [None]:
# Unknown-author poems vs. all other attributed authors.
vectorizer = CV(decode_error = 'ignore', max_df = 0.95, min_df = 0.05, binary = False)

# Corpora are selected by their string keys in lang_d / colors_d; the bare
# names `unknown` and `other` are not defined anywhere and raised NameError.
output_list, vocabulary = bayes_compare_language('unknown', 'other', figures_dir, cv=vectorizer)

In [None]:
# Franco vs. the unknown-author poems.
vectorizer = CV(decode_error = 'ignore', max_df = 0.95, min_df = 0.05, binary = False)

# Corpora are selected by their string keys in lang_d / colors_d; the bare
# names `franco` and `unknown` are not defined anywhere and raised NameError.
output_list, vocabulary = bayes_compare_language('Franco', 'unknown', figures_dir, cv=vectorizer)

In [None]:
# Franco and unknown-author poems pooled together vs. all other attributed authors.
vectorizer = CV(decode_error = 'ignore', max_df = 0.7, min_df = 0.1, binary = False)

# Make sure the pooled corpus exists under the same label colors_d uses for it.
# setdefault keeps this cell idempotent and leaves an existing entry untouched.
lang_d.setdefault('Franco + unknown', lang_d['Franco'] + lang_d['unknown'])

# Corpora are selected by string key; `franco+unknown` and `other` were
# undefined names and raised NameError.
output_list, vocabulary = bayes_compare_language('Franco + unknown', 'other', figures_dir, cv=vectorizer)

In [None]:
# Show both tails of the ranking: the ten lowest and the ten highest z-scores.
extremes = output_list[:10] + output_list[-10:]
for term_and_score in extremes:
  print(term_and_score)

('bel', -5.234728057618968)
('tuo', -5.191988343594773)
('occhi', -5.132252565753478)
('tua', -4.6647916757456995)
('onde', -4.147297522499295)
('te', -3.9282323718747176)
('sol', -3.8562925507206884)
('morte', -3.8459311737603445)
('luce', -3.8064597606893398)
('volto', -3.7507474879337654)
('con', 3.3811552858961984)
('senza', 3.637003383629772)
('quanto', 3.739699962522934)
('ancor', 4.3414427735870005)
('vostro', 4.659139367740917)
('da', 4.960107606189678)
('non', 5.269183151572305)
('vi', 5.336727414721941)
('mio', 5.4426905357980235)
('voi', 5.717610157839146)
