## Importation des données

In [1]:
import pandas as pd
from scipy.stats import ttest_ind
import numpy as np
# Example metrics CSV used by the exploratory cells below.
# NOTE(review): absolute, machine-specific path; the notebook cannot be re-run elsewhere without editing it.
path = "C:/Users/edgar/OneDrive/Bureau/Ecole/HEC/A24/BrevetNLP/exemple données/metrics/2012_H01L_claims_vs_claims_background_Metrics.csv"

In [2]:
# Load the example metrics table (the saved output of this cell is a FileNotFoundError for this path).
df = pd.read_csv(path)

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/edgar/OneDrive/Bureau/Ecole/HEC/A24/BrevetNLP/exemple données/metrics/2012_H01L_claims_vs_claims_background_Metrics.csv'

In [57]:
# Display the loaded metrics table.
df

Unnamed: 0,application_number,label,new_ratio,new_bin,uniq_ratio,uniq_bin,diff_ratio,diff_bin,neighboroud_distance,surpDiv_ratio,surpDiv_bin
0,13482313,0,0.000168,0,0.492441,0,0.46,0,0.379939,0.116744,1
1,13674006,1,0.000341,0,0.519067,0,0.95,1,0.379939,0.170545,1
2,13500084,0,0.000242,0,0.541795,1,0.76,0,0.379939,0.096064,1
3,13344492,1,0.000229,0,0.494423,0,0.60,0,0.379939,0.123130,1
4,13609643,1,0.000246,0,0.476105,0,0.00,0,0.379939,0.133284,1
...,...,...,...,...,...,...,...,...,...,...,...
19193,13512955,0,0.000238,0,0.508650,0,0.98,1,0.379939,0.167717,1
19194,13607276,1,0.000320,0,0.493389,0,0.98,1,0.379939,0.149307,1
19195,13380703,0,0.000371,0,0.581730,1,0.99,1,0.379939,0.189979,1
19196,13680773,1,0.000238,0,0.550384,1,1.00,1,0.379939,0.142447,1


### Corrélations labels et ratios

In [2]:
import pandas as pd
from scipy.stats import pearsonr, spearmanr

def correl_labelScores(df):
    """Tabulate the correlation between ``label`` and each of the four ratios.

    For every metric the Pearson and Spearman coefficients and their p-values
    are computed on the same rows: those where both ``label`` and the metric
    are present. Coefficient and p-value therefore always describe the same
    sample, and a missing value cannot turn the p-value into ``nan``.

    Parameters:
        df (pd.DataFrame): Must contain ``label``, ``new_ratio``,
            ``uniq_ratio``, ``diff_ratio`` and ``surpDiv_ratio``.

    Returns:
        pd.DataFrame: 3 rows x 5 columns labelled 0..4. Row 0 holds the
        headers, row 1 the Pearson results and row 2 the Spearman results,
        each formatted as ``"coef (p-value)"`` with three decimals.

    Raises:
        KeyError: If one of the expected columns is absent.
    """
    # (report header, source column) in the order the report lists them.
    metrics = [
        ("newness", "new_ratio"),
        ("uniqueness", "uniq_ratio"),
        ("difference", "diff_ratio"),
        ("surprise divergence", "surpDiv_ratio"),
    ]

    table = {
        0: [
            "Correlation label and metric",
            "pearson correlation (p-value)",
            "spearman correlation (p-value)",
        ]
    }
    for position, (title, column) in enumerate(metrics, start=1):
        # Pairwise-complete observations, i.e. the rows DataFrame.corr would use.
        pair = df[["label", column]].dropna()
        pearson_r, pearson_p = pearsonr(pair["label"], pair[column])
        spearman_r, spearman_p = spearmanr(pair["label"], pair[column])
        table[position] = [
            title,
            f"{pearson_r:.3f} ({pearson_p:.3f})",
            f"{spearman_r:.3f} ({spearman_p:.3f})",
        ]

    return pd.DataFrame(table)



In [3]:
import pandas as pd
from scipy.stats import ttest_ind

def ttest_metric(df):
    """Welch's t-test of every ratio column between label 1 and label 0.

    Parameters:
        df (pd.DataFrame): Must contain a ``label`` column; every column whose
            name ends with ``"_ratio"`` is tested (same suffix rule as the
            other report functions of this notebook).

    Returns:
        pd.DataFrame: Columns labelled 0, 1, 2. Row 0 is the header
        ``["", "t_stat", "p_value"]``; each following row is
        ``[column name, t statistic, p-value]``. When no ratio column exists
        only the header row is returned.
    """
    ratio_columns = [col for col in df.columns if col.endswith("_ratio")]

    # Build the boolean masks once instead of filtering the frame per column.
    is_group_1 = df["label"] == 1
    is_group_0 = df["label"] == 0

    # The header lives in the first data row because the report is exported without column names.
    rows = [["", "t_stat", "p_value"]]
    for col in ratio_columns:
        # equal_var=False selects Welch's t-test (no equal-variance assumption).
        t_stat, p_value = ttest_ind(
            df.loc[is_group_1, col], df.loc[is_group_0, col], equal_var=False
        )
        rows.append([col, t_stat, p_value])

    return pd.DataFrame(rows, columns=[0, 1, 2])


### Intercorrélations

In [4]:
from scipy.stats import kendalltau
import pandas as pd
import numpy as np

def KTcorrel_metrics(df):
    """Lower-triangular table of Kendall's tau between the ``*_ratio`` columns.

    Parameters:
        df (pd.DataFrame): Frame whose columns ending in ``"_ratio"`` are compared.

    Returns:
        pd.DataFrame: Columns labelled 0..n. Row 0 is ``[""] + names`` and
        column 0 repeats the names. Cells on or below the diagonal hold
        ``"tau (p-value)"`` with three decimals; cells above it hold ``"-"``.
    """
    names = [name for name in df.columns if name.endswith("_ratio")]

    # The first row doubles as the header because the report carries no real column names.
    table = [[""] + names]
    for row_pos, row_name in enumerate(names):
        cells = []
        for col_pos, col_name in enumerate(names):
            if col_pos > row_pos:
                # Upper triangle mirrors the lower one, so it is left as a placeholder.
                cells.append("-")
                continue
            tau, p_value = kendalltau(df[row_name], df[col_name])
            cells.append(f"{tau:.3f} ({p_value:.3f})")
        table.append([row_name] + cells)

    return pd.DataFrame(table, columns=list(range(len(names) + 1)))


In [5]:
from itertools import combinations
import pandas as pd
import numpy as np
import rbo  # Assuming rbo is already installed

def rbo_metrics(df, p):
    """Lower-triangular table of rank-biased overlap between the ``*_ratio`` columns.

    Each column is turned into a ranking of the frame's index labels, ordered
    by decreasing value, and every pair of rankings is compared with
    ``rbo.RankingSimilarity(...).rbo(p=p)``.

    Parameters:
        df (pd.DataFrame): Frame whose columns ending in ``"_ratio"`` are compared.
        p: Value forwarded as the ``p`` argument of the ``rbo`` call.

    Returns:
        pd.DataFrame: Columns labelled 0..n. Row 0 is ``[""] + names`` and
        column 0 repeats the names. The diagonal holds the float ``1.0``,
        cells below it hold the score formatted with three decimals, and
        cells above it hold ``"-"``.
    """
    names = [name for name in df.columns if name.endswith("_ratio")]

    def ranking(column):
        # Index labels ordered from the highest to the lowest value of the column.
        return df[column].sort_values(ascending=False).index.tolist()

    # The first row doubles as the header because the report carries no real column names.
    table = [[""] + names]
    for row_pos, row_name in enumerate(names):
        cells = []
        for col_pos, col_name in enumerate(names):
            if col_pos > row_pos:
                cells.append("-")
            elif col_pos == row_pos:
                # Diagonal is left as the unformatted float 1.0.
                cells.append(1.0)
            else:
                score = rbo.RankingSimilarity(ranking(col_name), ranking(row_name)).rbo(p=p)
                cells.append(f"{score:.3f}")
        table.append([row_name] + cells)

    return pd.DataFrame(table, columns=list(range(len(names) + 1)))


In [69]:
# Rank-biased overlap table for the currently loaded frame, with p = 0.9.
rbo_metrics(df, 0.9)

Unnamed: 0,0,1,2,3,4
0,,new_ratio,uniq_ratio,diff_ratio,surpDiv_ratio
1,new_ratio,1.0,-,-,-
2,uniq_ratio,0.052,1.0,-,-
3,diff_ratio,0.008,0.000,1.0,-
4,surpDiv_ratio,0.000,0.006,0.000,1.0


### RL (régression logistique)

In [6]:
import statsmodels.api as sm
import pandas as pd
import numpy as np

def rL_full(df):
    """Fit one logistic regression of ``label`` on the four ratios and tabulate it.

    Parameters:
        df (pd.DataFrame): Must contain ``label``, ``new_ratio``,
            ``uniq_ratio``, ``diff_ratio`` and ``surpDiv_ratio``.

    Returns:
        pd.DataFrame: 8 rows x 4 columns labelled 0..3. Row 0 is the header
        (``""``, ``coef``, ``std err``, ``P>|t|``). Rows 1-5 give the
        intercept and the four ratios, rounded to three decimals. The last
        two rows give the pseudo R-squared and the LLR p-value in column 1,
        with NaN in columns 2 and 3.
    """
    predictors = ['new_ratio', 'uniq_ratio', 'diff_ratio', 'surpDiv_ratio']
    design = sm.add_constant(df[predictors])  # prepend the intercept column
    fitted = sm.Logit(df['label'], design).fit()

    # (row label, model term) in report order; Difference is listed before Uniqueness.
    report_terms = [
        ('const', 'const'),
        ('Newness', 'new_ratio'),
        ('Difference', 'diff_ratio'),
        ('Uniqueness', 'uniq_ratio'),
        ('Surprise', 'surpDiv_ratio'),
    ]

    def rounded(estimates):
        # .get keeps the row (as NaN) when a term is absent from the fitted model.
        return [round(estimates.get(term, np.nan), 3) for _, term in report_terms]

    labels = [""] + [label for label, _ in report_terms] + ['Pseudo R-square', 'LLR p-value']
    fit_summary = [round(fitted.prsquared, 3), round(fitted.llr_pvalue, 3)]
    no_value = [np.nan, np.nan]  # the two summary rows have no std err / p-value

    return pd.DataFrame({
        0: labels,
        1: ['coef'] + rounded(fitted.params) + fit_summary,
        2: ['std err'] + rounded(fitted.bse) + no_value,
        3: ['P>|t|'] + rounded(fitted.pvalues) + no_value,
    })


In [73]:
# Full four-ratio logistic regression on the currently loaded frame.
rL_full(df)

Optimization terminated successfully.
         Current function value: 0.550152
         Iterations 19


Unnamed: 0,0,1,2,3
0,,coef,std err,P>|t|
1,const,2.131,0.236,0.0
2,Newness,89.98,338.084,0.79
3,Difference,-0.254,0.05,0.0
4,Uniqueness,-3.179,0.511,0.0
5,Surprise,5.063,0.64,0.0
6,Pseudo R-square,0.012,,
7,LLR p-value,0.0,,


In [7]:
import statsmodels.api as sm
import pandas as pd
import numpy as np

def rL_metricSeparate(df):
    """Fit one single-predictor logistic regression per ratio and stack the reports.

    Parameters:
        df (pd.DataFrame): Must contain ``label``, ``new_ratio``,
            ``uniq_ratio``, ``diff_ratio`` and ``surpDiv_ratio``.

    Returns:
        pd.DataFrame: Columns labelled 0..3. Each metric contributes six rows:
        a header row, the intercept row, the metric row, the pseudo R-squared,
        the LLR p-value (scientific notation, as text) and a blank separator.
    """
    y = df['label']
    metrics = ['new_ratio', 'uniq_ratio', 'diff_ratio', 'surpDiv_ratio']

    rows = []
    for metric in metrics:
        design = sm.add_constant(df[[metric]])  # intercept + the single predictor
        fit = sm.Logit(y, design).fit()
        params, errors, pvals = fit.params, fit.bse, fit.pvalues

        rows.extend([
            ['', 'coef', 'std err', 'P>|t|'],
            ['const', round(params['const'], 3), round(errors['const'], 3), round(pvals['const'], 3)],
            [metric, round(params[metric], 3), round(errors[metric], 3), round(pvals[metric], 3)],
            ['Pseudo R-square', round(fit.prsquared, 3), "", ""],
            ['LLR p-value', f"{fit.llr_pvalue:.3E}", "", ""],
            ["", "", "", ""],  # blank separator after each metric
        ])

    return pd.DataFrame(rows, columns=[0, 1, 2, 3])


In [76]:
# One single-predictor logistic regression per ratio on the currently loaded frame.
rL_metricSeparate(df)

Optimization terminated successfully.
         Current function value: 0.555513
         Iterations 16
Optimization terminated successfully.
         Current function value: 0.552403
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.553181
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.555014
         Iterations 5


Unnamed: 0,0,1,2,3
0,,coef,std err,P>|t|
1,const,1.636,0.073,0.0
2,new_ratio,-2004.447,277.535,0.0
3,Pseudo R-square,0.002,,
4,LLR p-value,6.054E-13,,
5,,,,
6,,coef,std err,P>|t|
7,const,3.443,0.179,0.0
8,uniq_ratio,-4.669,0.357,0.0
9,Pseudo R-square,0.008,,


In [9]:
import importlib
import textCleaning
importlib.reload(textCleaning)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\edgar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\edgar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<module 'textCleaning' from 'c:\\Users\\edgar\\OneDrive\\Bureau\\Ecole\\HEC\\A24\\BrevetNLP\\PatentNovelty\\novelty\\textCleaning.py'>

In [10]:
# Folder scanned for metrics CSVs by the export loop; output.xlsx is written there too.
# NOTE(review): this rebinds `path`, which the first cell set to a single CSV file, and it is
# an absolute, machine-specific location.
path = "C:/Users/edgar/OneDrive/Bureau/Ecole/HEC/A24/BrevetNLP/exemple données/metrics/"
from textCleaning import get_file_names, extract_year_ipc_vs
# Value passed as `p` to rbo_metrics in the export loop.
p=0.9

In [11]:
import pandas as pd

def merge_dataframes_with_blank_lines(df_list, df_names):
    """
    Stack several DataFrames vertically, each preceded by a title row and
    followed by a blank row (including the last one).

    Column labels are converted to strings and the union of all labels is
    used. Purely numeric labels are ordered by numeric value, so "10" comes
    after "9" and not after "1"; any other label follows in alphabetical
    order. Columns a frame does not have are filled with "".

    Parameters:
        df_list (list of pd.DataFrame): DataFrames to merge.
        df_names (list of str): Title of each DataFrame, in the same order.

    Returns:
        pd.DataFrame: The merged frame. For every input it holds one row with
        the name in the first column, the frame's rows, then one blank row.
        An empty DataFrame is returned when ``df_list`` is empty.

    Raises:
        ValueError: If ``df_list`` and ``df_names`` differ in length.
    """
    if len(df_list) != len(df_names):
        raise ValueError(
            f"df_list has {len(df_list)} frames but df_names has {len(df_names)} names"
        )
    if not df_list:
        # pd.concat refuses an empty list; an empty report is the natural result.
        return pd.DataFrame()

    def column_order(label):
        # Numeric labels first, by value; the label itself breaks ties such as "1" vs "01".
        if label.isascii() and label.isdigit():
            return (0, int(label), label)
        return (1, 0, label)

    all_columns = sorted(
        {str(column) for frame in df_list for column in frame.columns},
        key=column_order,
    )

    # Give every frame the same string-labelled columns, padding the missing ones with "".
    standardized_dfs = [
        frame.rename(columns=str).reindex(columns=all_columns, fill_value="")
        for frame in df_list
    ]

    blank_row = pd.DataFrame([[""] * len(all_columns)], columns=all_columns)

    pieces = []
    for frame, name in zip(standardized_dfs, df_names):
        title_row = pd.DataFrame([[name] + [""] * (len(all_columns) - 1)], columns=all_columns)
        pieces.extend([title_row, frame, blank_row])

    return pd.concat(pieces, ignore_index=True)


In [12]:
import pandas as pd

# Function to output data to Excel
def output_to_excel(df_list, sheet_names, output_file):
    """
    Write each DataFrame to its own sheet of one Excel workbook.

    Frames are written without their index. The workbook is produced with the
    ``xlsxwriter`` engine.

    Parameters:
        df_list (list of pd.DataFrame): DataFrames to write.
        sheet_names (list of str): Sheet name for each DataFrame, in the same order.
        output_file (str): Path of the Excel file to create.

    Returns:
        None

    Raises:
        ValueError: If ``df_list`` and ``sheet_names`` differ in length. The
            check happens before the file is opened, so nothing is written.
    """
    # zip() would silently drop the surplus frames or names; fail loudly instead.
    if len(df_list) != len(sheet_names):
        raise ValueError(
            f"df_list has {len(df_list)} frames but sheet_names has {len(sheet_names)} names"
        )

    with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
        for frame, title in zip(df_list, sheet_names):
            frame.to_excel(writer, sheet_name=title, index=False)

    print(f"DataFrames written to {output_file}")

# Build one report per metrics CSV found in `path` and export them all to a single workbook.
import os

# Merged report and sheet name of every processed file, in matching order.
df_list = []
sheet_names = []

for file in get_file_names(path):
    # Only CSV files are inputs. The workbook written below lands in this same folder, so a
    # re-run could otherwise try to parse output.xlsx as CSV.
    # NOTE(review): get_file_names may already filter by extension; confirm in textCleaning.
    if not file.lower().endswith(".csv"):
        continue

    print(file)
    print(extract_year_ipc_vs(file))
    # os.path.join works whether or not `path` ends with a separator.
    df = pd.read_csv(os.path.join(path, file))

    # One table per analysis
    correl = correl_labelScores(df)
    kt = KTcorrel_metrics(df)
    ttest_df = ttest_metric(df)
    rbo_df = rbo_metrics(df, p)
    rL = rL_full(df)
    rL_ind = rL_metricSeparate(df)

    # Stack the six tables, each under its title row
    final_df = merge_dataframes_with_blank_lines([correl, kt, ttest_df, rbo_df, rL, rL_ind], ['Corrélation', 'Kendall-Tau', "t-test", 'RBO', 'RL (MLE)', 'RL_ind (MLE)'])

    # Sheet name = parts returned by extract_year_ipc_vs joined with "_", cut to 31 characters.
    # NOTE(review): two files whose joined names share the first 31 characters would get the same sheet name.
    sheet_name = ('_'.join(extract_year_ipc_vs(file)))[:31]
    df_list.append(final_df)
    sheet_names.append(sheet_name)

# Output all DataFrames to an Excel file (previously `path + "/output.xlsx"`, which doubled the slash)
output_to_excel(df_list, sheet_names, os.path.join(path, "output.xlsx"))


2012_H01L_abstract_summary_vs_abstract_summary_background_Metrics.csv
('2012', 'H01L', 'abstract_summary_vs_abstract_summary_background')
Optimization terminated successfully.
         Current function value: 0.553340
         Iterations 13
Optimization terminated successfully.
         Current function value: 0.555767
         Iterations 12
Optimization terminated successfully.
         Current function value: 0.554124
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.554222
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.556844
         Iterations 5
2013_H01L_abstract_summary_vs_abstract_summary_background_Metrics.csv
('2013', 'H01L', 'abstract_summary_vs_abstract_summary_background')
Optimization terminated successfully.
         Current function value: 0.511983
         Iterations 16
Optimization terminated successfully.
         Current function value: 0.514079
         Iterations 11
Optimi