In [2]:
import pandas as pd
import gensim

In [3]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [4]:
# Load data
datafile = "moviedata.csv"
movies = pd.read_csv(datafile)

In [5]:
# Preprocess text
movies['text'] = movies['text'].apply(gensim.utils.simple_preprocess)


In [6]:
# Define word lists
competence_words = ['precocious', 'resourceful', 'inquisitive', 'sagacious', 'inventive', 'astute', 'adaptable', 'reflective', 'discerning', 'intuitive', 'inquiring', 'judicious', 'analytical', 'luminous', 'venerable', 'imaginative', 'shrewd', 'thoughtful', 'sage',
'smart', 'ingenious', 'clever', 'brilliant', 'logical', 'intelligent', 'apt', 'genius', 'wise']  
physical_appearance_words = ['alluring', 'voluptuous', 'blushing', 'homely', 'plump', 'sensual', 'gorgeous', 'slim', 'bald', 'athletic', 'fashionable', 'stout', 'ugly', 'muscular', 'slender', 'feeble', 'handsome', 'healthy', 'attractive', 'fat', 'weak', 'thin', 'pretty',
'beautiful', 'strong']  


In [7]:
model_files = [
    "word2vec-movies_1921_1930-text.model",
    "word2vec-movies_1931_1940-text.model",
    "word2vec-movies_1941_1950-text.model",
    "word2vec-movies_1951_1960-text.model",
    "word2vec-movies_1961_1970-text.model",
    "word2vec-movies_1971_1980-text.model",
    "word2vec-movies_1981_1990-text.model",
    "word2vec-movies_1991_2000-text.model",
    "word2vec-movies_2001_2010-text.model"
]

In [8]:
models = [gensim.models.Word2Vec.load(model_file) for model_file in model_files]


In [9]:
# Function to compute embedding bias for a word in a specific model
def compute_embedding_bias(model, word):
    # Check if the word exists in the vocabulary of the model
    if word not in model.wv:
        print(f"Word '{word}' not found in the vocabulary.")
        return None

    # Calculate the average vector for the word
    word_vector = model.wv[word]

    # Initialize variables to store distances from the word vector to gender-specific words
    distance_to_men = []
    distance_to_women = []

    # Define gender-specific words
    gender_words = {'man', 'men', 'woman', 'women', 'male', 'female'}  # You can extend this list

    # Calculate distances from the word vector to gender-specific words
    for gender_word in gender_words:
        if gender_word in model.wv:
            gender_word_vector = model.wv[gender_word]
            # Calculate cosine similarity between the word vector and gender-specific word vectors
            similarity = np.dot(word_vector, gender_word_vector) / (np.linalg.norm(word_vector) * np.linalg.norm(gender_word_vector))
            if 'man' in gender_word or 'male' in gender_word:
                distance_to_men.append(1 - similarity)  # 1 minus similarity gives distance
            else:
                distance_to_women.append(1 - similarity)

    # Calculate average distances to men and women
    avg_distance_to_men = np.mean(distance_to_men)
    avg_distance_to_women = np.mean(distance_to_women)

    # Calculate embedding bias as the difference between distances to men and women
    embedding_bias = avg_distance_to_women - avg_distance_to_men

    return embedding_bias


In [10]:
# Function to compute embedding bias for a word over time
def compute_bias_over_time(word, models):
    biases = []
    for model in models:
        # Compute embedding bias for the word in each model
        # You need to implement how to compute the bias based on your logic
        bias = compute_embedding_bias(model, word)
        biases.append(bias)
    return biases

In [11]:
# Compute biases for competence words over time
competence_biases_over_time = {}
for word in competence_words:
    biases = compute_bias_over_time(word, models)
    competence_biases_over_time[word] = biases

Word 'precocious' not found in the vocabulary.
Word 'precocious' not found in the vocabulary.
Word 'precocious' not found in the vocabulary.
Word 'precocious' not found in the vocabulary.
Word 'precocious' not found in the vocabulary.
Word 'precocious' not found in the vocabulary.
Word 'precocious' not found in the vocabulary.
Word 'precocious' not found in the vocabulary.
Word 'precocious' not found in the vocabulary.
Word 'sagacious' not found in the vocabulary.
Word 'sagacious' not found in the vocabulary.
Word 'sagacious' not found in the vocabulary.
Word 'sagacious' not found in the vocabulary.
Word 'sagacious' not found in the vocabulary.
Word 'sagacious' not found in the vocabulary.
Word 'sagacious' not found in the vocabulary.
Word 'sagacious' not found in the vocabulary.
Word 'sagacious' not found in the vocabulary.
Word 'inventive' not found in the vocabulary.
Word 'inventive' not found in the vocabulary.
Word 'inventive' not found in the vocabulary.
Word 'inventive' not foun

```Most of the competence words are not present in the corpus (movie lines) used to train the Word2Vec models.```

In [12]:
# Compute biases for physical appearance words over time
physical_appearance_biases_over_time = {}
for word in physical_appearance_words:
    biases = compute_bias_over_time(word, models)
    physical_appearance_biases_over_time[word] = biases

In [13]:
# Assuming you have already computed the biases over time for the competence and physical appearance words
# competence_biases_over_time and physical_appearance_biases_over_time are dictionaries containing biases for each word over time

# Extract the years from 1960 to 1990
years = range(1960, 1991)

# Prepare the data for competence words regression
competence_regression_data = {'Year': [], 'Word': [], 'Bias': []}

for word, biases in competence_biases_over_time.items():
    for year, bias in zip(years, biases):
        competence_regression_data['Year'].append(year)
        competence_regression_data['Word'].append(word)
        competence_regression_data['Bias'].append(bias)

competence_df = pd.DataFrame(competence_regression_data)

In [20]:
competence_df

Unnamed: 0,Year,Word,Bias,Constant
0,1960,precocious,,1
1,1961,precocious,,1
2,1962,precocious,,1
3,1963,precocious,,1
4,1964,precocious,,1
...,...,...,...,...
247,1964,wise,0.230830,1
248,1965,wise,0.212473,1
249,1966,wise,0.241944,1
250,1967,wise,0.186649,1


In [14]:
# Add a constant term for the regression
competence_df['Constant'] = 1

# Perform regression for competence words
competence_model = sm.OLS(competence_df['Bias'], competence_df[['Constant', 'Year']])
competence_results = competence_model.fit()
print("Regression results for competence words:")
print(competence_results.summary())

Regression results for competence words:
                            OLS Regression Results                            
Dep. Variable:                   Bias   R-squared:                         nan
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                       nan
Date:                Sun, 11 Feb 2024   Prob (F-statistic):                nan
Time:                        22:33:22   Log-Likelihood:                    nan
No. Observations:                 252   AIC:                               nan
Df Residuals:                     250   BIC:                               nan
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Constant   

In [15]:
#  Prepare the data for physical appearance words regression
physical_appearance_regression_data = {'Year': [], 'Word': [], 'Bias': []}

for word, biases in physical_appearance_biases_over_time.items():
    for year, bias in zip(years, biases):
        physical_appearance_regression_data['Year'].append(year)
        physical_appearance_regression_data['Word'].append(word)
        physical_appearance_regression_data['Bias'].append(bias)

physical_appearance_df = pd.DataFrame(physical_appearance_regression_data)


In [21]:
physical_appearance_df

Unnamed: 0,Year,Word,Bias,Constant
0,1960,alluring,0.094286,1
1,1961,alluring,0.228762,1
2,1962,alluring,0.154791,1
3,1963,alluring,0.216287,1
4,1964,alluring,0.074552,1
...,...,...,...,...
220,1964,strong,0.014492,1
221,1965,strong,0.025215,1
222,1966,strong,0.002842,1
223,1967,strong,0.005537,1


In [16]:
# Add a constant term for the regression
physical_appearance_df['Constant'] = 1

# Perform regression for physical appearance words
physical_appearance_model = sm.OLS(physical_appearance_df['Bias'], physical_appearance_df[['Constant', 'Year']])
physical_appearance_results = physical_appearance_model.fit()
print("\nRegression results for physical appearance words:")
print(physical_appearance_results.summary())


Regression results for physical appearance words:
                            OLS Regression Results                            
Dep. Variable:                   Bias   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     1.252
Date:                Sun, 11 Feb 2024   Prob (F-statistic):              0.264
Time:                        22:33:22   Log-Likelihood:                 258.13
No. Observations:                 225   AIC:                            -512.3
Df Residuals:                     223   BIC:                            -505.4
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
C

- R-squared: The R-squared value indicates the proportion of variance in the dependent variable (embedding bias) explained by the independent variable (years). In this case, the R-squared value is 0.006, indicating that ```only a small proportion of the variance in embedding bias is explained by the years.```

- Adjusted R-squared: The adjusted R-squared value adjusts the R-squared value for the number of predictors in the model. It is similar to R-squared but penalizes the addition of unnecessary predictors. In this case, the adjusted R-squared value is 0.001, which is very close to zero, suggesting that the independent variable (years) may not be a good predictor of embedding bias for physical appearance words.

- F-statistic and Prob (F-statistic): The F-statistic tests the overall significance of the regression model. The associated p-value (Prob (F-statistic)) indicates the probability of obtaining an F-statistic as extreme as the one observed if the null hypothesis (that all regression coefficients are zero) is true. In this case, ```the p-value is 0.264, which is greater than the typical significance level of 0.05. Therefore, we fail to reject the null hypothesis, suggesting that the regression model may not be statistically significant.```

- Coefficients: The coefficients represent the estimated effects of the independent variable (years) on the dependent variable (embedding bias). In this case, the coefficient for the year variable is -0.0022, indicating a slight negative trend over time. However, the coefficient is not statistically significant at the 0.05 level, as indicated by the p-value (P>|t|) of 0.264.

Overall, based on these regression results, it appears that ```there is little to no significant relationship between the years and the embedding bias for physical appearance words.``` The model does not provide strong evidence to suggest that the embedding bias for physical appearance words has changed significantly over time from the 1960s to 1990s.





