In [None]:
import numpy as np
import pandas as pd
from ada_config.config import CONFIG
import statsmodels.api as sm
import plotly.graph_objects as go

import random
import pandas as pd
from collections import deque
from sklearn.metrics import f1_score

import networkx as nx
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
# NOTE(review): presumably re-creates the `np.bool` alias (removed in NumPy 1.24) because
# datawig or one of its dependencies still references it; kept ahead of the datawig import
# for that reason — confirm.
np.bool = np.bool_
from datawig import SimpleImputer
import plotly.io as pio

# Seed Python's `random` (used by `random.shuffle` in the matching code below) and NumPy's
# global RNG so repeated runs start from the same random state.
random.seed(42)
np.random.seed(42)

In [None]:
# Load the raw movie table from the configured data directory.
df_movies = pd.read_csv(CONFIG["data_path"] / "MovieVerse.csv")

In [None]:
# Preview the first rows of the raw table.
df_movies.head()

In [None]:
# Number of movies with a missing raw budget (the column is dropped in the next cell).
df_movies['budget'].isna().sum()

In [None]:
# Remove the columns that are not carried forward into the rest of the notebook.
unused_columns = [
    'Wikipedia_movie_ID',
    'Freebase_movie_ID',
    'movie_name',
    'movie_summary',
    'tmdb_id',
    'imdb_id',
    'titleType',
    'budget',
    'movie_revenue',
]
df_movies.drop(columns=unused_columns, inplace=True)

In [None]:
# Store the adult flag as a float column.
df_movies['adult'] = df_movies['adult'].astype(float)

In [None]:
# Collapse the three crew popularity columns into their row-wise maximum,
# then discard the individual columns.
crew_popularity_cols = ['Director_popularity', 'Writer_popularity', 'Producer_popularity']
df_movies['crew_popularity_max'] = df_movies[crew_popularity_cols].max(axis=1)
df_movies.drop(columns=crew_popularity_cols, inplace=True)

In [None]:
# Summary statistics of the new crew popularity feature.
df_movies['crew_popularity_max'].describe()

In [None]:
# Only the three top-billed stars feed the cast feature: stars 4 and 5 are discarded,
# stars 1-3 are averaged row-wise and then removed.
top_star_cols = ['star_1_popularity', 'star_2_popularity', 'star_3_popularity']
df_movies.drop(columns=['star_4_popularity', 'star_5_popularity'], inplace=True)
df_movies['cast_popularity_avg'] = df_movies[top_star_cols].mean(axis=1)
df_movies.drop(columns=top_star_cols, inplace=True)

In [None]:
# Summary statistics of the new cast popularity feature.
df_movies['cast_popularity_avg'].describe()

In [None]:
# Genres that get their own `is_<genre>` indicator column.
most_genres = ["Drama", "Comedy", "Thriller", "Romance", "Action", "Black-And-White"]

def genres_dummies(row):
    """Encode a comma-separated genre string as 0/1 flags, one per entry of `most_genres`.

    Parameters
    ----------
    row : str or missing value
        Genre names separated by commas, or a missing value (None / NaN).

    Returns
    -------
    list
        One element per genre in `most_genres`, in the same order: 1 if the genre is
        listed in `row`, 0 otherwise. For a missing `row`, a list of None of the same length.
    """
    if pd.isna(row):
        return [None] * len(most_genres)
    # Split on the bare comma and strip each piece, so "Drama,Comedy" and
    # "Drama , Comedy" are parsed the same way as "Drama, Comedy".
    listed = {piece.strip() for piece in row.split(',')}
    return [int(genre in listed) for genre in most_genres]

# Expand the list returned by genres_dummies for each movie into one `is_<genre>` column
# per entry of most_genres.
df_movies[[f'is_{genre}' for genre in most_genres]] = df_movies['movie_genres'].apply(genres_dummies).apply(pd.Series)
# The raw genre string is not needed once the indicators exist.
df_movies.drop(columns='movie_genres', inplace=True)

In [None]:
# Summary statistics of the genre indicator columns.
df_movies.describe()[[f'is_{genre}' for genre in most_genres]]

In [None]:
# Fold the sentiment label into the score: keep the score when the label is 'POSITIVE',
# use 1 - score for any other label. Rows missing either the label or the score are
# excluded here and therefore end up as NaN after the index-aligned assignment.
labelled = df_movies[['sentiment_label', 'sentiment_score']].dropna()
is_positive = labelled['sentiment_label'] == 'POSITIVE'
# Vectorised replacement for a row-wise DataFrame.apply; also well-defined when no row is labelled.
df_movies['sentiment_score'] = labelled['sentiment_score'].where(is_positive, 1 - labelled['sentiment_score'])
df_movies.drop(columns='sentiment_label', inplace=True)

In [None]:
# Summary statistics of the recoded sentiment score.
df_movies['sentiment_score'].describe()

In [None]:
# Flag movies whose country list contains the United States.
# Movies without a country list are skipped here and stay NaN after the aligned assignment.
known_countries = df_movies['movie_countries'].dropna()
made_in_usa = known_countries.apply(
    lambda listing: 'United States of America' in [name.strip() for name in listing.split(', ')]
)
df_movies['is_USA_movie'] = made_in_usa.astype(float)
df_movies.drop(columns='movie_countries', inplace=True)

In [None]:
# Summary statistics of the USA indicator.
df_movies['is_USA_movie'].describe()

In [None]:
# Flag movies whose original language code is 'en'.
# Movies without a language are skipped here and stay NaN after the aligned assignment.
known_language = df_movies['original_language'].dropna()
df_movies['is_en'] = (known_language == 'en').astype(float)
df_movies.drop(columns='original_language', inplace=True)

In [None]:
# Summary statistics of the English-language indicator.
df_movies['is_en'].describe()

In [None]:

# Bucket the release year into left-closed decades labelled "<start>-<start+9>".
decade_starts = range(1880, 2030, 10)
year_interval = pd.cut(
    df_movies['year'],
    bins=range(1880, 2031, 10),
    right=False,
    labels=[f"{start}-{start + 9}" for start in decade_starts],
).astype(str)  # a missing year becomes the literal string 'nan'

# Merge everything before 1920 into one bucket and everything from 2010 on into another.
# Vectorised `mask` replaces two row-wise DataFrame.apply passes; comparisons with a
# missing year are False, so those rows keep the 'nan' label.
year_interval = year_interval.mask(df_movies['year'] < 1920, '1888-1919')
year_interval = year_interval.mask(df_movies['year'] >= 2010, '2010-2021')
df_movies['year_interval'] = year_interval

# One indicator column per bucket; drop_first removes the first bucket as the baseline.
df_movies = pd.get_dummies(df_movies, columns=["year_interval"], drop_first=True)

df_movies.drop(columns=['year'], inplace=True)

In [None]:
# Movies with a missing year were encoded as the dummy `year_interval_nan`. For those rows
# every year dummy must be missing (not 0), so that they are not read as the baseline bucket.
cols = [col for col in df_movies.columns if col.startswith('year_interval')]

# Store the dummies as float so they can hold NaN regardless of the dtype get_dummies
# produced (bool or uint8, depending on the pandas version).
df_movies[cols] = df_movies[cols].astype(float)

# The indicator column only exists when at least one year was missing.
if 'year_interval_nan' in df_movies.columns:
    # Explicit boolean mask: a 0/1 integer Series handed to .loc is interpreted as row
    # labels, not as a mask, which would blank the wrong rows.
    unknown_year = df_movies['year_interval_nan'].astype(bool)
    df_movies.loc[unknown_year, cols] = np.nan
    df_movies.drop(columns=['year_interval_nan'], inplace=True)

In [None]:
# Summary statistics of the remaining year-interval indicator columns.
df_movies.describe()[[col for col in df_movies.columns if col.startswith('year_interval')]]


In [None]:
# Correlate the per-column missingness indicators to see which columns tend to be
# missing together.
nan_df = df_movies.isna()

nan_corr = nan_df.corr()

# Draw on an explicitly created figure/axes instead of relying on the new figure
# happening to be figure number 1.
fig, ax = plt.subplots(figsize=(8, 6))
heatmap = ax.matshow(nan_corr, cmap='coolwarm')
fig.colorbar(heatmap, ax=ax)
tick_positions = list(range(df_movies.shape[1]))
ax.set_xticks(tick_positions)
ax.set_xticklabels(df_movies.columns, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(df_movies.columns)
ax.set_title("NaN Correlation Matrix", pad=20)
plt.show()

In [None]:
# Impute every feature column from all the other feature columns with datawig.
# `wikidata_id` is an identifier and is neither imputed nor used as an input.
cols = list(df_movies.columns)
cols.remove('wikidata_id')
imputed_dict = {}
for col in cols:
    # Nothing to impute: skip the (expensive) model fit for complete columns.
    if not df_movies[col].isna().any():
        continue
    print(col, ':')
    c_cols = [other for other in cols if other != col]
    imputer = SimpleImputer(
        input_columns=c_cols,
        output_column=col
    )
    imputer.fit(df_movies)
    # predict() returns the input frame plus a `<col>_imputed` column holding the
    # predictions; `<col>` itself is the untouched original and still contains the NaNs.
    imputed_dict[col] = imputer.predict(df_movies)[f'{col}_imputed']

# Fill the gaps only after all models were fitted, so that every imputer was trained on
# the original (un-imputed) data.
for col, imputed in imputed_dict.items():
    missing = df_movies[col].isna()
    df_movies.loc[missing, col] = imputed[missing]

In [None]:
# Persist the prepared feature table so the regression part can start from this file.
df_movies.to_csv(CONFIG["data_path"] / "regression_data.csv", index=False)

Regression

In [None]:
# Reload the prepared feature table written above, plus the remake pairs table.
df_movies = pd.read_csv(CONFIG["data_path"] / "regression_data.csv")
# Read below through its `remake_wikidata_id` and `original_wikidata_id` columns.
df_remakes_dataset = pd.read_csv(CONFIG["data_path"] / "remakes.csv")

In [None]:
# is_remake: the movie appears as the remake side of a remake pair.
# has_remake: the movie appears as the original side of a remake pair.
remake_ids = df_remakes_dataset["remake_wikidata_id"]
original_ids = df_remakes_dataset["original_wikidata_id"]
df_movies['is_remake'] = df_movies["wikidata_id"].isin(remake_ids).astype(float)
df_movies['has_remake'] = df_movies["wikidata_id"].isin(original_ids).astype(float)
# The identifier has served its purpose; `adult` is not used as a regressor.
df_movies.drop(columns=['wikidata_id', 'adult'], inplace=True)

In [None]:
# Replace the decade dummies by a single feature: 1 minus the four dummies covering 1980
# onwards, i.e. 1 when the movie falls in none of those intervals.
df_movies['is_before_1980'] = 1 - df_movies['year_interval_1990-1999'] - df_movies['year_interval_2000-2009'] - df_movies['year_interval_2010-2021'] - df_movies['year_interval_1980-1989']
# All year_interval_* columns (including the earlier decades) are dropped afterwards.
df_movies.drop(columns=[col for col in df_movies.columns if col.startswith('year_interval')], inplace=True)

In [None]:
# log(adjusted_revenue / adjusted_budget), computed as a difference of logs.
df_movies['log_revenue_budget_ratio'] = np.log(df_movies['adjusted_revenue']) - np.log(df_movies['adjusted_budget'])
# Only the revenue column is removed; adjusted_budget stays in the table.
df_movies.drop(columns=['adjusted_revenue'], inplace=True)

In [None]:
# Every remaining column is stored as float.
df_movies = df_movies.astype(float)

In [None]:

# Split the columns into indicator-like ones (left untouched) and continuous ones
# (standardised). A column counts as indicator-like when its stripped name starts with
# 'is' or 'year', or is 'has_remake' / 'adult'.
cols = list(df_movies.columns)
stripped_names = [name.strip() for name in cols]
cat_cols = [
    name for name in stripped_names
    if name.startswith(('is', 'year')) or name in ('has_remake', 'adult')
]
non_cat_cols = [name for name in stripped_names if name not in cat_cols]

# Standardise the continuous columns, then replace their missing values by 0.
sc_ = StandardScaler()
df_movies.loc[:, non_cat_cols] = sc_.fit_transform(df_movies[non_cat_cols])
df_movies[non_cat_cols] = df_movies[non_cat_cols].fillna(0)
# Rows still containing a NaN (necessarily in an indicator-like column) are discarded.
df_movies.dropna(inplace=True)


In [None]:


def plot_significance_plotly(model, path_to_save, auto_open=True):
    """Plot each fitted coefficient with its confidence interval and save the figure as HTML.

    Parameters
    ----------
    model : fitted statsmodels results object
        Must provide `params` and `conf_int()`; the interval level is whatever
        `conf_int()` returns by default.
    path_to_save : str or path-like
        Destination of the HTML export.
    auto_open : bool, default True
        Forwarded to `plotly.io.write_html`; when True the exported file is opened
        once written.

    Returns
    -------
    pandas.DataFrame
        One row per predictor (the 'const' row is excluded) with the columns
        'coef', 'lower', 'upper' and 'significant'. A coefficient is flagged as
        significant when its interval does not contain 0.
    """
    coefficients = model.params
    conf_intervals = model.conf_int()

    coef_df = pd.concat([coefficients, conf_intervals], axis=1)
    coef_df.columns = ['coef', 'lower', 'upper']
    # `drop` returns a new frame, so adding a column below does not raise
    # SettingWithCopyWarning (boolean filtering followed by assignment does).
    coef_df = coef_df.drop(index='const', errors='ignore')

    coef_df['significant'] = (coef_df['lower'] > 0) | (coef_df['upper'] < 0)

    fig = go.Figure()

    for predictor, row in coef_df.iterrows():
        # Point estimate: red when significant, black otherwise.
        color = 'red' if row['significant'] else 'black'
        fig.add_trace(
            go.Scatter(
                x=[predictor],
                y=[row['coef']],
                mode='markers',
                marker=dict(size=10, color=color),
                name=f'{predictor} Coef'
            )
        )

        # Vertical segment spanning the confidence interval.
        fig.add_trace(
            go.Scatter(
                x=[predictor, predictor],
                y=[row['lower'], row['upper']],
                mode='lines',
                line=dict(color='blue', width=2),
                showlegend=False
            )
        )

    # Dashed reference line at a coefficient value of 0.
    fig.add_shape(
        type="line",
        x0=-0.5,
        x1=len(coef_df) - 0.5,
        y0=0,
        y1=0,
        line=dict(color="black", width=1, dash="dash")
    )

    fig.update_layout(
        title={
            'text': 'Coefficients and Confidence Intervals',
            'x': 0.5,
            'xanchor': 'center'
        },
        xaxis_title='Predictor Variables',
        yaxis_title='Coefficient Value',
        xaxis=dict(tickvals=list(range(len(coef_df))), ticktext=coef_df.index, tickangle=45),
        showlegend=False,
        template="plotly_white",
        autosize=True,
        height=600,
        width=800,
    )

    # Hand-made colour key, since the trace legend is hidden.
    fig.add_annotation(
        xref="paper",
        yref="paper",
        x=1.15,
        y=1.05,
        text="<br><span style='color:red'>Red = Significant</span><br>Black = Not Significant",
        showarrow=False,
        font=dict(size=12),
        align="left",
        bordercolor="black",
        borderwidth=1,
        borderpad=5,
        bgcolor="white"
    )

    fig.show()

    pio.write_html(fig, path_to_save, auto_open=auto_open, include_plotlyjs="cdn")

    return coef_df


In [None]:

def regress(df, has_remake=True):
    """Fit a logistic regression for one of the two remake flags and print its diagnostics.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table containing both 'has_remake' and 'is_remake'.
    has_remake : bool, default True
        If True the target is 'has_remake' and rows with is_remake != 0 are excluded;
        if False the target is 'is_remake' and rows with has_remake != 0 are excluded.

    Returns
    -------
    The fitted statsmodels Logit results. In-sample accuracy and F1 (at a 0.5 cut-off)
    and the model summary are printed along the way.
    """
    if has_remake:
        target_col, excluded_col = 'has_remake', 'is_remake'
    else:
        target_col, excluded_col = 'is_remake', 'has_remake'

    subset = df[df[excluded_col] == 0]

    y = subset[target_col]
    X = subset.drop(columns=['has_remake', 'is_remake'])
    print('number of samples with positive y:', y.sum())

    y = y.astype(float)
    # Always add an intercept column, even if a constant column already exists.
    X = sm.add_constant(X.astype(float), has_constant='add')
    model = sm.Logit(y, X).fit()

    predicted_positive = model.predict(X) > 0.5
    print('acc:', predicted_positive.eq(y).mean())
    print('f1:', f1_score(y, predicted_positive))
    print(model.summary())
    return model

In [None]:
# Logistic regression for "has a remake" (movies that are themselves remakes excluded).
has_model = regress(df_movies, has_remake=True)
# Plot and export the coefficients; the cell output is the coefficient table it returns.
plot_significance_plotly(has_model, 'has_remake.html')

In [None]:
# Logistic regression for "is a remake" (movies that have a remake excluded).
res_model = regress(df_movies, has_remake=False)
# Plot and export the coefficients; the cell output is the coefficient table it returns.
plot_significance_plotly(res_model, 'is_remake.html')

In [None]:

def plot_ate_or_histograms(results, output_file="ate_or_histograms.html"):
    """Draw and export one bar chart of ATE values and one of OR values.

    Parameters
    ----------
    results : dict
        Maps a column name to a dict holding the keys 'ATE', 'OR' and 'ATE_std_ere'
        (the spelling of the last key is the one produced by check_each_col_treat).
    output_file : str, default "ate_or_histograms.html"
        Base name of the exports. The ATE figure is written to "<root>_ate<ext>" and
        the OR figure to "<root>_or<ext>", where root/ext come from splitting
        `output_file` at its extension.

    Raises
    ------
    ValueError
        If `results` is empty or any entry lacks one of the three expected keys.
    """
    import os

    columns = list(results.keys())
    try:
        ate_values = [results[col]['ATE'] for col in columns]
        or_values = [results[col]['OR'] for col in columns]
        # Half-width of the error bars: 1.96 standard errors on each side of the ATE.
        ate_std_err = [results[col]['ATE_std_ere'] * 1.96 for col in columns]
    except KeyError as e:
        raise ValueError(f"Missing expected key in results: {e}") from e

    if not ate_values or not or_values:
        raise ValueError("ATE or OR values are empty. Check the input results dictionary.")

    # Split once so both exports get distinct names even when `output_file` does not
    # end in ".html".
    root, ext = os.path.splitext(output_file)
    ate_path = f"{root}_ate{ext}"
    or_path = f"{root}_or{ext}"

    fig_ate = go.Figure()

    fig_ate.add_trace(go.Bar(
        x=columns,
        y=ate_values,
        name="ATE",
        marker=dict(color="#636EFA"),
        textposition="outside",
        showlegend=False,
    ))
    fig_ate.add_trace(go.Scatter(
        x=columns,
        y=ate_values,
        mode='markers',
        marker=dict(size=10, color='red'),
        name="ATE 95% CI",
        error_y=dict(
            type='data',
            array=ate_std_err,
            visible=True
        ),
        showlegend=False,
    ))

    # Dashed horizontal reference lines at ATE = +0.05 and ATE = -0.05.
    for reference_level in (0.05, -0.05):
        fig_ate.add_shape(
            type="line",
            x0=-0.5,
            x1=len(columns) - 0.5,
            y0=reference_level,
            y1=reference_level,
            line=dict(color="black", width=1, dash="dash")
        )

    fig_ate.update_layout(
        title="ATE Metrics Across Columns",
        xaxis=dict(title="Columns", tickmode="array", tickvals=list(range(len(columns))), ticktext=columns),
        yaxis=dict(title="ATE Values"),
        template="plotly_white",
        autosize=True,
        height=600,
        width=800
    )

    fig_ate.show()
    pio.write_html(fig_ate, ate_path, auto_open=True, include_plotlyjs="cdn", auto_play=False)

    fig_or = go.Figure()

    fig_or.add_trace(go.Bar(
        x=columns,
        y=or_values,
        name="OR",
        marker=dict(color="#EF553B"),
        textposition="outside"
    ))

    fig_or.update_layout(
        title="OR Metrics Across Columns",
        xaxis=dict(title="Columns", tickmode="array", tickvals=list(range(len(columns))), ticktext=columns),
        yaxis=dict(title="OR Values"),
        template="plotly_white",
        autosize=True,
        height=600,
        width=800
    )

    fig_or.show()
    pio.write_html(fig_or, or_path, auto_open=True, include_plotlyjs="cdn", auto_play=False)

In [None]:

def _caliper_edges(control_scores, treatment_scores, caliper, max_odds_ratio):
    """List the (treated position, control position) pairs that are allowed to be matched.

    Both score arrays must be sorted in ascending order. A pair is allowed when the
    absolute difference of the two propensity scores is strictly below `caliper` and the
    ratio of their odds lies within [1 / max_odds_ratio, max_odds_ratio].
    """
    window = deque()  # (position, score) of treated units inside the current caliper window
    next_treated = 0
    n_treated = len(treatment_scores)
    edges = []
    for control_pos, control_score in enumerate(control_scores):
        # Pull in every treated unit below the upper bound of the window. Units that are
        # already too low are pulled in as well and evicted right below; stopping at them
        # instead would stall this pointer for all remaining (higher-scored) controls.
        while next_treated < n_treated and treatment_scores[next_treated] - control_score < caliper:
            window.append((next_treated, treatment_scores[next_treated]))
            next_treated += 1
        # Evict treated units that fell below the lower bound of the window.
        while window and control_score - window[0][1] >= caliper:
            window.popleft()
        control_odds = control_score / (1 - control_score)
        for treated_pos, treated_score in window:
            if 1 / max_odds_ratio <= control_odds / (treated_score / (1 - treated_score)) <= max_odds_ratio:
                edges.append((treated_pos, control_pos))
    return edges


def check_each_col_treat(df, has_remake=True, sen_thr=5):
    """Compare each feature between propensity-matched rows with and without the remake flag.

    The remake flag plays the role of the grouping ("treatment") variable: rows where it
    is True are matched one-to-one to rows where it is False. For every feature column in
    turn, a logistic model of the flag on all *other* features provides the propensity
    score used for matching, and the compared quantity ("outcome") is whether the feature
    exceeds a threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table containing both 'has_remake' and 'is_remake'.
    has_remake : bool, default True
        If True the flag is 'has_remake' and rows with is_remake != 0 are excluded;
        if False the flag is 'is_remake' and rows with has_remake != 0 are excluded.
    sen_thr : float, default 5
        Largest ratio allowed between the propensity odds of two matched rows.

    Returns
    -------
    dict
        Maps each feature column to a dict with
        'ATE' (mean outcome difference, flagged minus unflagged, over matched pairs),
        'OR' (largest odds ratio, in either direction, among the matched pairs) and
        'ATE_std_ere' (standard error of the ATE). All three are NaN when no pair
        could be matched for that column.
    """
    if has_remake:
        col_keep = 'has_remake'
        col_remove = 'is_remake'
    else:
        col_keep = 'is_remake'
        col_remove = 'has_remake'

    df = df[df[col_remove] == 0].reset_index(drop=True)

    X = df.drop(columns=['has_remake', 'is_remake'])
    y = df[col_keep].astype('bool')

    # Caliper on the propensity-score difference. It depends on y only, so it is
    # computed once instead of once per column.
    MAX_MATCHING_THRESHOLD = y.std() / y.shape[0] ** 0.5

    result = {}
    print('number of samples with positive y:', y.sum())
    for treat_col in X.columns:
        result[treat_col] = {}
        print()
        print('##############', treat_col, '##############')

        # Threshold turning the feature into a binary outcome.
        if len(X[treat_col].value_counts()) == 2:
            thr = 0.5  # two-valued column: split between the two values
        elif 'sentiment' in treat_col:
            thr = 0.0
        elif 'budget' in treat_col:
            thr = X[treat_col].mean() + X[treat_col].std() * 2
        else:
            thr = X[treat_col].mean() + X[treat_col].std() * 1.45

        treatment = y
        covariates = X.drop(columns=treat_col)
        design = sm.add_constant(covariates, has_constant='add')
        model = sm.Logit(treatment, design).fit()
        print('thr:', thr, 'MAX_MATCHING_THRESHOLD:', MAX_MATCHING_THRESHOLD)

        score_col = f'{treat_col}_propensity_score'
        outcome_col = f'{treat_col}_outcome'
        df[outcome_col] = df[treat_col] > thr
        df[score_col] = model.predict(design)

        # Sort both groups by score; after reset_index the index equals the sorted position.
        sorted_control_df = df[~treatment].sort_values(by=score_col, ascending=True).reset_index(drop=True)
        sorted_treatment_df = df[treatment].sort_values(by=score_col, ascending=True).reset_index(drop=True)
        n_treatment = len(sorted_treatment_df)

        # Graph nodes: treated rows are 0 .. n_treatment-1, control rows are shifted by n_treatment.
        edges = [
            (treated_pos, control_pos + n_treatment)
            for treated_pos, control_pos in _caliper_edges(
                sorted_control_df[score_col].values,
                sorted_treatment_df[score_col].values,
                MAX_MATCHING_THRESHOLD,
                sen_thr,
            )
        ]

        # Insertion order is shuffled before the graph is built.
        G = nx.Graph()
        random.shuffle(edges)
        G.add_edges_from(edges)
        nodes = list(sorted_treatment_df.index)
        random.shuffle(nodes)
        G.add_nodes_from(nodes, bipartite=0)
        nodes = [ind_ + n_treatment for ind_ in range(len(sorted_control_df))]
        random.shuffle(nodes)
        G.add_nodes_from(nodes, bipartite=1)

        # The returned dict holds every matched pair in both directions, hence the // 2.
        matching = nx.bipartite.maximum_matching(G, top_nodes=list(sorted_treatment_df.index))
        print(f'{treat_col} matched:', len(matching) // 2)
        pairs = [[matching[i] - n_treatment, i] for i in range(n_treatment) if i in matching]

        if not pairs:
            # Nothing could be matched: report undefined metrics instead of failing on
            # the 2-D indexing below.
            print(f'{treat_col}: no matched pairs, ATE and OR are undefined')
            result[treat_col]['ATE'] = np.nan
            result[treat_col]['OR'] = np.nan
            result[treat_col]['ATE_std_ere'] = np.nan
            continue

        pairs = np.array(pairs)  # column 0: control position, column 1: treated position
        y_control = sorted_control_df.loc[pairs[:, 0], outcome_col].values
        y_treatment = sorted_treatment_df.loc[pairs[:, 1], outcome_col].values
        ATE_arr = np.array(y_treatment, dtype=float) - np.array(y_control, dtype=float)
        treat_score = sorted_treatment_df.loc[pairs[:, 1], score_col].values
        control_score = sorted_control_df.loc[pairs[:, 0], score_col].values
        OR_arr = treat_score / (1 - treat_score) / (control_score / (1 - control_score))
        print(f'{treat_col} matching ATE:', ATE_arr.mean())
        # Worst imbalance among the matched pairs, whichever side has the larger odds.
        OR = np.maximum(1 / OR_arr.min(), OR_arr.max())
        print(f'{treat_col} matching OR:', OR)
        result[treat_col]['ATE'] = ATE_arr.mean()
        result[treat_col]['OR'] = OR
        result[treat_col]['ATE_std_ere'] = ATE_arr.std() / (len(ATE_arr) ** 0.5)
    return result

In [None]:
# Matched comparison of every feature for the 'has_remake' flag, then plot/export the metrics.
has_res = check_each_col_treat(df_movies, has_remake=True)
# NOTE(review): 'casual' in the file name looks like a typo for 'causal' — left unchanged.
plot_ate_or_histograms(has_res, 'casual_has_remake.html')

In [None]:
# Matched comparison of every feature for the 'is_remake' flag, then plot/export the metrics.
is_res = check_each_col_treat(df_movies, has_remake=False)
# NOTE(review): 'casual' in the file name looks like a typo for 'causal' — left unchanged.
plot_ate_or_histograms(is_res, 'casual_is_remake.html')