In [1]:
import itertools
import numpy as np
import datetime
import os
import pandas as pd
import time
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import logging
from gensim.test.utils import datapath
from gensim import models
from gensim.models import CoherenceModel
import plotly.express as px

In [2]:
# Location of the grid-search results CSV.
# The GRID_SEARCH_CSV environment variable, when set, overrides the
# machine-specific default below, so the notebook can be run on another
# machine without editing this cell.
gs_path = os.environ.get(
    "GRID_SEARCH_CSV",
    r"C:\Users\noabi\OneDrive\שולחן העבודה\university\Year 3\project\lda model analysis\1694597770_grid_search.csv",
)

In [3]:
# Read the grid-search results CSV at gs_path into a DataFrame.
gs_df = pd.read_csv(filepath_or_buffer=gs_path)

In [4]:
# Distinct values of the num_topics column; the plotting cells iterate over them.
topics = gs_df["num_topics"].unique()

In [5]:
# Preview the first five rows of the grid-search results.
gs_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,alpha,eta,passes,num_topics,no_below_int,no_above_percent,chunksize,iterations,eval_every,cv_score,u_mass_score,c_uci_score,identifier
0,0,0.1,0.1,5,25,200,0.7,2000,50,,0.516125,-0.942575,0.490382,1693237937
1,1,0.1,0.1,5,30,200,0.7,2000,50,,0.508231,-1.004144,0.491161,1693238245
2,2,0.1,0.1,5,35,200,0.7,2000,50,,0.516327,-0.996208,0.532862,1693238552
3,3,0.1,0.2,5,25,200,0.7,2000,50,,0.513736,-0.954181,0.475892,1693238850
4,4,0.1,0.2,5,30,200,0.7,2000,50,,0.503018,-0.987517,0.415913,1693239119


In [38]:
# metric: the gs_df column analysed by the cells that reference it.
# metric_name: the display name used for that column in plot labels and titles.
metric, metric_name = "u_mass_score", "Umass-Coherence"

The following graphs show the selected coherence score (`metric`, currently UMass coherence) vs. eta vs. alpha, one graph per number of topics.

In [39]:
# One 3-D scatter per topic count: alpha and eta on the x/y axes, the
# selected metric on the z axis and as the marker colour.
axis_labels = {metric: metric_name, "alpha": "Alpha", "eta": "Eta"}
for topic in topics:
    per_topic = gs_df.loc[gs_df["num_topics"] == topic]
    fig = px.scatter_3d(
        per_topic,
        x="alpha",
        y="eta",
        z=metric,
        color=metric,
        labels=axis_labels,
        title=f"Alpha VS Eta VS {metric_name}, topics: {topic}",
    )
    fig.show()

The following graphs highlight, for each number of topics, the models whose selected coherence score (`metric`) is above the 0.99 quantile, i.e. the top 1% of models.

In [48]:
# NOTE(review): this import is no longer used in this cell; it is kept in case
# other cells rely on the name being in scope. The original
# `fig = make_subplots(rows=2, cols=2)` was removed because `fig` was
# overwritten inside the loop before ever being used.
from plotly.subplots import make_subplots

# Quantile of the metric (computed separately for each topic count) above
# which a model is highlighted.
HIGH_SCORE_QUANTILE = 0.99

# For each topic count, plot every (alpha, eta) combination coloured by the
# selected metric, then overlay the highest-scoring models as yellow markers.
for topic in topics:
    df = gs_df[gs_df["num_topics"] == topic]
    threshold = df[metric].quantile(q=HIGH_SCORE_QUANTILE)
    high_score_mask = df[metric] > threshold
    high_x = df.loc[high_score_mask, "alpha"]
    high_y = df.loc[high_score_mask, "eta"]
    # Colour by `metric` (not a hard-coded column) so the colour scale always
    # matches the column used for the threshold, the labels and the title.
    fig = px.scatter(df, x='alpha', y='eta', color=metric,
                     title=f"Parameter Space VS {metric_name}, Topics Num: {topic}",
                     labels={"eta": "Eta", "alpha": "Alpha", metric: metric_name})
    # Overlay the high-scoring models as larger yellow markers.
    fig.add_traces(px.scatter(x=high_x, y=high_y).update_traces(marker_size=10, marker_color="yellow").data)

    fig.show()

In [44]:
# Per topic count: scatter the selected metric against alpha with an OLS
# trendline, then print the summary of the fitted model.
for topic in topics:
    per_topic = gs_df.loc[gs_df["num_topics"] == topic]
    fig = px.scatter(
        per_topic,
        x="alpha",
        y=metric,
        trendline="ols",
        title=f"Topics:{topic}",
    )
    fig.show()
    results = px.get_trendline_results(fig)
    ols_fit = results["px_fit_results"].iloc[0]
    print(ols_fit.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.010
Model:                            OLS   Adj. R-squared:                  0.009
Method:                 Least Squares   F-statistic:                     14.58
Date:                Sun, 24 Sep 2023   Prob (F-statistic):           0.000140
Time:                        13:47:09   Log-Likelihood:                 3321.7
No. Observations:                1521   AIC:                            -6639.
Df Residuals:                    1519   BIC:                            -6629.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.9741      0.001   -683.330      0.0

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     2.503
Date:                Sun, 24 Sep 2023   Prob (F-statistic):              0.114
Time:                        13:47:10   Log-Likelihood:                 3428.7
No. Observations:                1521   AIC:                            -6853.
Df Residuals:                    1519   BIC:                            -6843.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.0004      0.001   -752.949      0.0

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     7.761
Date:                Sun, 24 Sep 2023   Prob (F-statistic):            0.00541
Time:                        13:47:10   Log-Likelihood:                 3489.6
No. Observations:                1521   AIC:                            -6975.
Df Residuals:                    1519   BIC:                            -6965.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.0168      0.001   -796.516      0.0

In [35]:
# Per topic count: scatter the selected metric against eta with an OLS
# trendline, then print the summary of the fitted model.
for topic in topics:
    # The title identifies which topic count each figure belongs to; without
    # it the per-topic figures cannot be told apart.
    fig = px.scatter(gs_df[gs_df["num_topics"]==topic], x='eta', y=metric, trendline = "ols", title = f"Topics:{topic}")
    fig.show()
    results = px.get_trendline_results(fig)
    print(results.px_fit_results.iloc[0].summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     1.862
Date:                Sun, 24 Sep 2023   Prob (F-statistic):              0.173
Time:                        13:44:50   Log-Likelihood:                 4875.2
No. Observations:                1521   AIC:                            -9746.
Df Residuals:                    1519   BIC:                            -9736.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.5047      0.001    983.167      0.0

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.5217
Date:                Sun, 24 Sep 2023   Prob (F-statistic):              0.470
Time:                        13:44:50   Log-Likelihood:                 5002.1
No. Observations:                1521   AIC:                        -1.000e+04
Df Residuals:                    1519   BIC:                            -9990.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.5091      0.000   1078.157      0.0

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.582
Date:                Sun, 24 Sep 2023   Prob (F-statistic):              0.209
Time:                        13:44:50   Log-Likelihood:                 4978.6
No. Observations:                1521   AIC:                            -9953.
Df Residuals:                    1519   BIC:                            -9943.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.5124      0.000   1068.352      0.0

In [29]:
# Scatter the selected metric against eta over all grid-search rows with an
# OLS trendline, then print the summary of the fitted model.
eta_plot_kwargs = dict(
    x="eta",
    y=metric,
    trendline="ols",
    title=f"{metric_name} Score VS Eta",
    labels={metric: metric_name, "eta": "Eta"},
)
fig = px.scatter(gs_df, **eta_plot_kwargs)
fig.show()
results = px.get_trendline_results(fig)
ols_fit = results["px_fit_results"].iloc[0]
print(ols_fit.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.555
Date:                Sun, 24 Sep 2023   Prob (F-statistic):              0.212
Time:                        13:43:43   Log-Likelihood:                 9591.3
No. Observations:                4563   AIC:                        -1.918e+04
Df Residuals:                    4561   BIC:                        -1.917e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.9973      0.001  -1116.931      0.0

In [30]:
# Scatter the selected metric against alpha over all grid-search rows with an
# OLS trendline, then print the summary of the fitted model.
alpha_plot_kwargs = dict(
    x="alpha",
    y=metric,
    trendline="ols",
    title=f"{metric_name} VS Alpha",
    labels={metric: metric_name, "alpha": "Alpha"},
)
fig = px.scatter(gs_df, **alpha_plot_kwargs)
fig.show()
results = px.get_trendline_results(fig)
ols_fit = results["px_fit_results"].iloc[0]
print(ols_fit.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     2.218
Date:                Sun, 24 Sep 2023   Prob (F-statistic):              0.136
Time:                        13:43:51   Log-Likelihood:                 9591.6
No. Observations:                4563   AIC:                        -1.918e+04
Df Residuals:                    4561   BIC:                        -1.917e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.9971      0.001  -1116.801      0.0

In [53]:
# Scatter u_mass_score against cv_score over all grid-search rows with an OLS
# trendline, then print the summary of the fitted model.
score_labels = {"u_mass_score": "Umass", "cv_score": "Cv"}
fig = px.scatter(
    gs_df,
    x="cv_score",
    y="u_mass_score",
    labels=score_labels,
    title="Cv VS Umass scores",
    trendline="ols",
)
fig.show()
results = px.get_trendline_results(fig)
ols_fit = results["px_fit_results"].iloc[0]
print(ols_fit.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.094
Model:                            OLS   Adj. R-squared:                  0.094
Method:                 Least Squares   F-statistic:                     475.6
Date:                Tue, 26 Sep 2023   Prob (F-statistic):          2.24e-100
Time:                        17:31:42   Log-Likelihood:                 9816.8
No. Observations:                4563   AIC:                        -1.963e+04
Df Residuals:                    4561   BIC:                        -1.962e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.5228      0.022    -23.975      0.0

In [56]:
# NOTE(review): this cell repeats the Cv-vs-Umass scatter/OLS code that
# already appears earlier in the notebook; confirm whether both are needed.
# Scatter u_mass_score against cv_score with an OLS trendline, then print the
# summary of the fitted model.
cv_umass_kwargs = dict(
    x="cv_score",
    y="u_mass_score",
    labels={"u_mass_score": "Umass", "cv_score": "Cv"},
    title="Cv VS Umass scores",
    trendline="ols",
)
fig = px.scatter(gs_df, **cv_umass_kwargs)
fig.show()
results = px.get_trendline_results(fig)
first_fit = results["px_fit_results"].iloc[0]
print(first_fit.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.096
Model:                            OLS   Adj. R-squared:                  0.096
Method:                 Least Squares   F-statistic:                     487.0
Date:                Tue, 26 Sep 2023   Prob (F-statistic):          1.25e-102
Time:                        17:33:55   Log-Likelihood:                 9822.0
No. Observations:                4563   AIC:                        -1.964e+04
Df Residuals:                    4561   BIC:                        -1.963e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.8858      0.005   -173.300      0.0