In [9]:

import pandas as pd
import nltk
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from collections import Counter
from google.colab import files

# Fetch the NLTK resources this cell relies on: the perceptron tagger backs
# pos_tag and the WordNet corpus backs wn.synsets / WordNetLemmatizer.
# 'punkt' is not used by the code in this cell; it is kept in case other
# cells of the notebook depend on it.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

print("Please upload your three CSV files (ish, like, and esque)...")
uploaded = files.upload()


def _load_words(uploaded_files, stem):
    """Load the single-column word list for ``stem`` into a DataFrame.

    Colab stores a re-uploaded file under a numbered name (the upload log
    prints lines such as "Saving ish_words.csv to ish_words (4).csv"), so
    reading the fixed path '/content/<stem>.csv' would silently pick up the
    first, stale upload. The content just uploaded is therefore preferred.

    Args:
        uploaded_files: Mapping returned by ``files.upload()``; assumed to
            map file names to the uploaded bytes (per the Colab docs).
        stem: File name without extension, e.g. ``'ish_words'``.

    Returns:
        A DataFrame with one column named ``'word'``.
    """
    import io  # local import keeps this block independent of file-level imports

    extension = '.csv'
    for name, payload in uploaded_files.items():
        if not (name.startswith(stem) and name.endswith(extension)):
            continue
        # Text between the stem and the extension: '' for the plain name,
        # ' (4)' for a numbered copy created by Colab.
        middle = name[len(stem):-len(extension)]
        numbered_copy = (
            middle.startswith(' (')
            and middle.endswith(')')
            and middle[2:-1].isdigit()
        )
        if middle == '' or numbered_copy:
            return pd.read_csv(io.BytesIO(payload), header=None, names=['word'])
    # Nothing matching was uploaded in this run: fall back to the file
    # already present on disk, as the original code did.
    return pd.read_csv(f'/content/{stem}{extension}', header=None, names=['word'])


ish_df = _load_words(uploaded, 'ish_words')
like_df = _load_words(uploaded, 'like_words')
esque_df = _load_words(uploaded, 'esque_words')

def remove_suffix_any(word, suffixes):
    """Strip the first matching suffix from ``word`` and return the base.

    Suffixes are tried in the order given; the first one that ``word`` ends
    with is removed. If the remaining base ends with a hyphen (e.g.
    ``'kafka-'`` from ``'kafka-esque'``), that single hyphen is dropped too.

    Args:
        word: The word to strip.
        suffixes: Iterable of candidate suffix strings.

    Returns:
        The base form, or ``word`` unchanged when no suffix matches. A word
        that consists solely of the suffix yields an empty string.
    """
    for suffix in suffixes:
        # An empty suffix "matches" every word, and word[:-0] is '', which
        # would wipe out the whole word; skip such entries.
        if suffix and word.endswith(suffix):
            base = word[:-len(suffix)]
            return base[:-1] if base.endswith('-') else base
    return word

def guess_from_wordnet(base_word):
    """Guess a coarse part of speech for ``base_word`` from WordNet.

    Looks the word up with ``wn.synsets`` and translates the POS code of the
    first synset returned into one of 'NOUN', 'VERB', 'ADJ', 'ADV' or
    'OTHER'.

    Returns:
        The coarse tag, or ``None`` when WordNet has no synset for the word.
    """
    senses = wn.synsets(base_word)
    if not senses:
        return None

    # NOTE(review): the first synset is treated as the word's primary sense.
    # NLTK appears to list synsets grouped by POS (nouns first) rather than
    # by overall frequency, which would bias results toward NOUN - confirm.
    coarse_by_wordnet_pos = {
        'n': 'NOUN',
        'v': 'VERB',
        'a': 'ADJ',
        's': 'ADJ',  # satellite adjectives count as adjectives
        'r': 'ADV',
    }
    return coarse_by_wordnet_pos.get(senses[0].pos(), 'OTHER')

def penn_to_universal(penn_tag):
    """Collapse a Penn Treebank tag into a coarse part-of-speech label.

    Tags beginning with 'NN', 'VB', 'JJ' or 'RB' become 'NOUN', 'VERB',
    'ADJ' or 'ADV' respectively; every other tag becomes 'OTHER'.
    """
    prefix_table = (
        ('NN', 'NOUN'),
        ('VB', 'VERB'),
        ('JJ', 'ADJ'),
        ('RB', 'ADV'),
    )
    for prefix, label in prefix_table:
        if penn_tag.startswith(prefix):
            return label
    return 'OTHER'

def get_base_pos_improved(word, suffixes):
    """Return a coarse POS label for the base of a suffixed ``word``.

    The suffix is stripped with ``remove_suffix_any`` and the base is then
    classified by the first strategy that produces a label:

    1. WordNet lookup of the base itself.
    2. WordNet lookup of the lemmatized base.
    3. The NLTK tagger applied to the base alone, mapped through
       ``penn_to_universal``.
    """
    stem = remove_suffix_any(word, suffixes)

    label = guess_from_wordnet(stem)
    if not label:
        # Retry with the lemma of the base (computed only when needed).
        label = guess_from_wordnet(lemmatizer.lemmatize(stem))
    if not label:
        # Last resort: tag the base as a one-word sentence.
        _, penn = pos_tag([stem])[0]
        label = penn_to_universal(penn)
    return label

def classify_base_pos(df, suffixes):
    """Classify the base POS of every word in ``df['word']``.

    Missing values are dropped and the remaining words are lower-cased
    before each is passed to ``get_base_pos_improved``.

    Returns:
        A tuple ``(counts, pos_tags, words)``: a ``Counter`` of labels, the
        per-word labels, and the cleaned word list (the latter two aligned).
    """
    cleaned = df['word'].dropna()
    words = cleaned.str.lower().tolist()

    pos_tags = [get_base_pos_improved(token, suffixes) for token in words]

    tally = Counter()
    tally.update(pos_tags)
    return tally, pos_tags, words

# For each ending, try the hyphenated spelling first and the bare one second,
# e.g. ['-ish', 'ish'].
ish_suffixes, like_suffixes, esque_suffixes = (
    [f'-{ending}', ending] for ending in ('ish', 'like', 'esque')
)

# Each call yields (label counts, per-word labels, cleaned words) for one list.
ish_counts, ish_base_pos, ish_words = classify_base_pos(
    ish_df, ish_suffixes
)
like_counts, like_base_pos, like_words = classify_base_pos(
    like_df, like_suffixes
)
esque_counts, esque_base_pos, esque_words = classify_base_pos(
    esque_df, esque_suffixes
)

def normalize_counts(counts, total):
    """Convert raw label counts into percentages of ``total``.

    Args:
        counts: Mapping of label to count.
        total: Denominator used for every label.

    Returns:
        A dict mapping each label to ``(count / total) * 100``.
    """
    shares = {}
    for label, occurrences in counts.items():
        shares[label] = (occurrences / total) * 100
    return shares

# Percentage breakdown per suffix; an empty word list yields an empty dict
# instead of calling normalize_counts with a zero total.
ish_percentages = {}
if ish_words:
    ish_percentages = normalize_counts(ish_counts, len(ish_words))

like_percentages = {}
if like_words:
    like_percentages = normalize_counts(like_counts, len(like_words))

esque_percentages = {}
if esque_words:
    esque_percentages = normalize_counts(esque_counts, len(esque_words))

# Report counts and percentages, one pair of lines per suffix followed by a
# blank line.
print(
    f"Suffix -ish base POS counts: {ish_counts}",
    f"Suffix -ish percentages: {ish_percentages}\n",
    sep="\n",
)
print(
    f"Suffix -like base POS counts: {like_counts}",
    f"Suffix -like percentages: {like_percentages}\n",
    sep="\n",
)
print(
    f"Suffix -esque base POS counts: {esque_counts}",
    f"Suffix -esque percentages: {esque_percentages}\n",
    sep="\n",
)

# Share of noun bases per suffix (0 when no base was labelled NOUN).
noun_percent_ish = ish_percentages.get('NOUN', 0)
noun_percent_like = like_percentages.get('NOUN', 0)
noun_percent_esque = esque_percentages.get('NOUN', 0)

# Pick the (suffix, noun share) pair with the largest share; on a tie the
# earliest entry wins.
most_productive_suffix = max(
    [
        ('ish', noun_percent_ish),
        ('like', noun_percent_like),
        ('esque', noun_percent_esque),
    ],
    key=lambda pair: pair[1],
)
print(f"\nThe suffix '{most_productive_suffix[0]}' attaches most frequently to nouns by percentage ({most_productive_suffix[1]:.2f}%).")

# Write one word/base_pos table per suffix, in the same order as above.
for _path, _words, _tags in (
    ('ish_base_pos.csv', ish_words, ish_base_pos),
    ('like_base_pos.csv', like_words, like_base_pos),
    ('esque_base_pos.csv', esque_words, esque_base_pos),
):
    pd.DataFrame({'word': _words, 'base_pos': _tags}).to_csv(_path, index=False)

print("Files exported: ish_base_pos.csv, like_base_pos.csv, and esque_base_pos.csv")

Please upload your three CSV files (ish, like, and esque)...


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Saving esque_words.csv to esque_words (4).csv
Saving ish_words.csv to ish_words (4).csv
Saving like_words.csv to like_words (4).csv
Suffix -ish base POS counts: Counter({'NOUN': 874, 'ADJ': 66, 'OTHER': 63})
Suffix -ish percentages: {'NOUN': 87.13858424725822, 'OTHER': 6.281156530408774, 'ADJ': 6.580259222333002}

Suffix -like base POS counts: Counter({'NOUN': 983, 'ADJ': 13, 'OTHER': 7})
Suffix -like percentages: {'NOUN': 98.00598205383848, 'OTHER': 0.6979062811565304, 'ADJ': 1.296111665004985}

Suffix -esque base POS counts: Counter({'NOUN': 168, 'OTHER': 4, 'ADJ': 3})
Suffix -esque percentages: {'NOUN': 96.0, 'OTHER': 2.2857142857142856, 'ADJ': 1.7142857142857144}


The suffix 'like' attaches most frequently to nouns by percentage (98.01%).
Files exported: ish_base_pos.csv, like_base_pos.csv, and esque_base_pos.csv
