# Proportion of Verb Usage Analysis

This notebook preprocesses and cleans verb usage data, then runs a regression analysis of the proportions of verb tenses across subgenres and historical periods.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
import unicodedata
import re

## Data Cleaning Functions

In [None]:
def clean_string_value(value):
    '''
    Normalizes a value into a lowercase, ASCII-only, underscore-separated string.

    Missing values (as detected by ``pd.isnull``) are returned unchanged.
    Non-string values, such as integer column labels, are converted with
    ``str()`` first because ``unicodedata.normalize`` only accepts ``str``.

    Processing steps:
      1. Decompose accented characters (NFKD) and drop every non-ASCII code
         point, so that e.g. 'é' becomes 'e'.
      2. Remove every character that is not a word character, whitespace
         or a hyphen.
      3. Strip surrounding whitespace, replace spaces with underscores and
         lowercase the result.

    Hyphens are kept as-is; only literal space characters become underscores.
    '''
    if pd.isnull(value):
        return value
    if not isinstance(value, str):
        # e.g. numeric labels; normalize() would raise TypeError on these
        value = str(value)
    nfkd_form = unicodedata.normalize('NFKD', value)
    # After encoding with errors='ignore' only ASCII bytes remain
    only_ascii = nfkd_form.encode('ASCII', 'ignore').decode('ASCII')
    only_ascii = re.sub(r"[^\w\s-]", "", only_ascii)
    return only_ascii.strip().replace(' ', '_').lower()

def clean_column_names(col_names):
    '''Returns a new list holding the clean_string_value form of each entry in col_names.'''
    return list(map(clean_string_value, col_names))

## Load and Clean the Dataset

In [None]:
# Read the comma-separated, UTF-8 encoded CSV into a DataFrame
df = pd.read_csv(
    '/content/df_metadated_floflo_updated.csv',
    sep=',',
    encoding='utf-8',
)

# Normalize the header labels, then display them
df.columns = clean_column_names(df.columns)
print("Cleaned Column Names:", df.columns, sep="\n")

## Validate Required Columns

In [None]:
# Column names (in their cleaned form) whose presence is checked below
required_columns = [
    'col_name',
    'present', 'imparfait', 'passe_simple', 'passe_compose', 'futur', 'plus_que_parfait',
    'date', 'canon', 'subgenre',
]

# Collect the expected names that are absent from the frame, in listed order.
# This only reports the result; it does not stop execution.
missing_columns = list(filter(lambda name: name not in df.columns, required_columns))
if not missing_columns:
    print("\nAll required columns are present.")
else:
    print("\nWarning: Missing columns:", missing_columns)

## Add Calculated Columns

In [None]:
# Row-wise sum over the six tense columns
df['total_verbes'] = df.loc[
    :,
    ['present', 'imparfait', 'passe_simple', 'passe_compose', 'futur', 'plus_que_parfait'],
].sum(axis='columns')

# Share of each row's total taken by 'present' and by 'passe_simple'
df['proportion_present'] = df['present'].div(df['total_verbes'])
df['proportion_passe_simple'] = df['passe_simple'].div(df['total_verbes'])

## Assign Periods Based on Date

In [None]:
def assign_period_21(year):
    '''
    Maps a year to the label of the period that contains it.

    The label has the form '<first>_<last>' with both bounds inclusive.
    Returns np.nan when the year lies in none of the ranges.
    '''
    # Inclusive (first, last) bounds; the final range is longer than the others
    period_bounds = (
        (1811, 1831),
        (1832, 1852),
        (1853, 1873),
        (1874, 1894),
        (1895, 1915),
        (1916, 1936),
        (1937, 1957),
        (1958, 1978),
        (1979, 1999),
        (2000, 2024),
    )
    for first, last in period_bounds:
        if first <= year <= last:
            return f'{first}_{last}'
    return np.nan

# Label each row with its period, then keep only the rows that received a label
# (dates outside every range yield NaN from assign_period_21)
df['period_21'] = df['date'].map(assign_period_21)
df = df[df['period_21'].notna()]

## Regression Analysis

In [None]:
# Run OLS regression for proportion of 'present'.
#
# The period enters the model as a categorical term, C(period_21). Raw period
# labels such as 1832_1852 cannot be written into the formula as separate
# terms: they start with a digit, so the formula parser does not read them as
# variable names, and no dummy columns with those names exist in df.
# With C(...), the dummy coding is done by the formula machinery itself; its
# default treatment coding leaves one level out as the reference category.
formula_reg1 = 'proportion_present ~ canon + C(period_21)'

model_reg1 = smf.ols(formula=formula_reg1, data=df).fit()
print(model_reg1.summary())

## Visualize Regression Coefficients

In [None]:
# Gather the point estimates and their confidence bounds in one table.
# After the concat the estimates sit in the column labelled 0 (presumably
# because the params Series has no name), next to '2.5%' and '97.5%'.
params = model_reg1.params
conf = model_reg1.conf_int()
conf.columns = ['2.5%', '97.5%']
coef = pd.concat([params, conf], axis=1)

# Horizontal bars, one per model term, with asymmetric error bars:
# the distance from the estimate down to the lower bound and up to the upper bound.
plt.figure(figsize=(12, 6))
plt.barh(coef.index, coef[0], xerr=[coef[0] - coef['2.5%'], coef['97.5%'] - coef[0]], capsize=4)
plt.axvline(x=0, color='black', linestyle='--')  # reference line at zero
plt.xlabel('Coefficient estimate (error bars: 2.5% to 97.5% bounds)')
plt.ylabel('Model term')
plt.title('Regression Coefficients for Proportion of Present')
plt.show()