In [1]:
import pandas as pd

# Read the saved dataset and, in the same expression, parse the
# 'Earnings_Call_Date' column with pd.to_datetime.
unlabeled_data = pd.read_csv("unlabeled_data.csv").assign(
    Earnings_Call_Date=lambda frame: pd.to_datetime(frame['Earnings_Call_Date'])
)

# Show the leading rows as a quick sanity check of the load
print(unlabeled_data.head())


  Stock_Name Earnings_Call_Date  Earnings_Surprise     Firm_Size  CAR_3_Days  \
0        STZ         2020-01-08           0.169399  3.650714e+10   -0.014271   
1        RPM         2020-01-08           0.041096  9.467565e+09   -0.008348   
2        PTC         2020-01-22           0.266667  9.242700e+09    0.094707   
3         PG         2020-01-23           0.036496  3.170399e+11    0.014272   
4        RJF         2020-01-23           0.000000  1.262831e+10    0.017126   

   CAR_10_Days  CAR_30_Days  method_1  method_2  method_3  
0     0.010221     0.083406  0.432441  0.768821  0.486680  
1    -0.026677    -0.059266  0.194338  0.378917  0.132254  
2     0.089449     0.049796  0.318199  0.427348  0.141501  
3     0.005064     0.015538  0.397074  0.502051  0.364789  
4     0.026210    -0.054972  0.251476  0.127434  0.137728  


In [6]:
# Create three independent copies of the dataset, one per labeling scheme
# used further down (df1: CAR > 0, df2: CAR > mean, df3: CAR > median), so
# label columns are added to the copies and never to `unlabeled_data`.
df1, df2, df3 = (unlabeled_data.copy() for _ in range(3))

# Each copy should report the same shape as the source frame
print(f"df1 shape: {df1.shape}")
print(f"df2 shape: {df2.shape}")
print(f"df3 shape: {df3.shape}")

df1 shape: (8306, 10)
df2 shape: (8306, 10)
df3 shape: (8306, 10)


In [10]:
# Zero-threshold labelling on df1: a label is 1 when the CAR for that window
# is strictly greater than 0, otherwise 0.

# label_1 is derived from CAR_3_Days
df1['label_1'] = df1['CAR_3_Days'].gt(0).astype(int)
print(df1.loc[:, ['CAR_3_Days', 'label_1']].head())

# label_2 is derived from CAR_10_Days
df1['label_2'] = df1['CAR_10_Days'].gt(0).astype(int)
print(df1.loc[:, ['CAR_10_Days', 'label_2']].head())

# label_3 is derived from CAR_30_Days
df1['label_3'] = df1['CAR_30_Days'].gt(0).astype(int)
print(df1.loc[:, ['CAR_30_Days', 'label_3']].head())

# Full frame with all three label columns appended
print(df1.head())


   CAR_3_Days  label_1
0   -0.014271        0
1   -0.008348        0
2    0.094707        1
3    0.014272        1
4    0.017126        1
   CAR_10_Days  label_2
0     0.010221        1
1    -0.026677        0
2     0.089449        1
3     0.005064        1
4     0.026210        1
   CAR_30_Days  label_3
0     0.083406        1
1    -0.059266        0
2     0.049796        1
3     0.015538        1
4    -0.054972        0
  Stock_Name Earnings_Call_Date  Earnings_Surprise     Firm_Size  CAR_3_Days  \
0        STZ         2020-01-08           0.169399  3.650714e+10   -0.014271   
1        RPM         2020-01-08           0.041096  9.467565e+09   -0.008348   
2        PTC         2020-01-22           0.266667  9.242700e+09    0.094707   
3         PG         2020-01-23           0.036496  3.170399e+11    0.014272   
4        RJF         2020-01-23           0.000000  1.262831e+10    0.017126   

   CAR_10_Days  CAR_30_Days  method_1  method_2  method_3  label_1  label_2  \
0     0.010221

In [None]:
def assign_rank(df):
    """
    Add a 'rank' column derived from how many of label_1, label_2 and label_3 equal 1.

    The three label columns are summed row-wise and the sum is translated
    into a rank, where more positive labels means a better (lower) rank:

    - sum of 3 → rank 1
    - sum of 2 → rank 2
    - sum of 1 → rank 3
    - sum of 0 → rank 4

    Any other sum has no entry in the lookup table and therefore maps to NaN.

    Note: the column is written onto the DataFrame that was passed in (the
    input is modified in place) and that same object is returned.

    Parameters:
        df (pd.DataFrame): DataFrame containing label_1, label_2, and label_3 columns.

    Returns:
        pd.DataFrame: The same DataFrame, now carrying a 'rank' column.
    """
    # Lookup table: number of positive labels -> rank
    rank_by_positive_count = {0: 4, 1: 3, 2: 2, 3: 1}

    # Row-wise count of labels that are 1 (labels are expected to be 0/1)
    positive_count = df[['label_1', 'label_2', 'label_3']].sum(axis=1)

    df['rank'] = positive_count.map(rank_by_positive_count)
    return df

# Rank df1 from its three zero-threshold labels
df1 = df1.pipe(assign_rank)

# Leading rows, now including the 'rank' column
print(df1.head())


  Stock_Name Earnings_Call_Date  Earnings_Surprise     Firm_Size  CAR_3_Days  \
0        STZ         2020-01-08           0.169399  3.650714e+10   -0.014271   
1        RPM         2020-01-08           0.041096  9.467565e+09   -0.008348   
2        PTC         2020-01-22           0.266667  9.242700e+09    0.094707   
3         PG         2020-01-23           0.036496  3.170399e+11    0.014272   
4        RJF         2020-01-23           0.000000  1.262831e+10    0.017126   

   CAR_10_Days  CAR_30_Days  method_1  method_2  method_3  label_1  label_2  \
0     0.010221     0.083406  0.432441  0.768821  0.486680        0        1   
1    -0.026677    -0.059266  0.194338  0.378917  0.132254        0        0   
2     0.089449     0.049796  0.318199  0.427348  0.141501        1        1   
3     0.005064     0.015538  0.397074  0.502051  0.364789        1        1   
4     0.026210    -0.054972  0.251476  0.127434  0.137728        1        1   

   label_3  rank  
0        1     2  
1     

In [18]:
# Write df1 to 'df1.csv'; index=False keeps the row index out of the file
df1.to_csv(path_or_buf="df1.csv", index=False)

print("Processed data saved as 'df1.csv'.")

Processed data saved as 'df1.csv'.


In [16]:
# Mean-threshold labelling on df2: a label is 1 when the CAR for that window
# is strictly greater than the column mean, otherwise 0.
car_means = df2[['CAR_3_Days', 'CAR_10_Days', 'CAR_30_Days']].mean()

# label_1 compares CAR_3_Days against its mean
df2['label_1'] = df2['CAR_3_Days'].gt(car_means.loc['CAR_3_Days']).astype(int)

# label_2 compares CAR_10_Days against its mean
df2['label_2'] = df2['CAR_10_Days'].gt(car_means.loc['CAR_10_Days']).astype(int)

# label_3 compares CAR_30_Days against its mean
df2['label_3'] = df2['CAR_30_Days'].gt(car_means.loc['CAR_30_Days']).astype(int)

# Show the thresholds, then each CAR column next to the label derived from it
print("Mean Values Used for Labeling:")
print(car_means)
print(
    df2.loc[
        :, ['CAR_3_Days', 'label_1', 'CAR_10_Days', 'label_2', 'CAR_30_Days', 'label_3']
    ].head(100)
)


Mean Values Used for Labeling:
CAR_3_Days     0.003126
CAR_10_Days    0.003208
CAR_30_Days    0.002031
dtype: float64
    CAR_3_Days  label_1  CAR_10_Days  label_2  CAR_30_Days  label_3
0    -0.014271        0     0.010221        1     0.083406        1
1    -0.008348        0    -0.026677        0    -0.059266        0
2     0.094707        1     0.089449        1     0.049796        1
3     0.014272        1     0.005064        1     0.015538        1
4     0.017126        1     0.026210        1    -0.054972        0
..         ...      ...          ...      ...          ...      ...
95    0.028566        1     0.003689        1     0.017732        1
96    0.029075        1     0.034162        1    -0.007515        0
97    0.029962        1     0.096126        1     0.186042        1
98    0.157220        1     0.148475        1     0.218370        1
99    0.000053        0    -0.022676        0     0.017140        1

[100 rows x 6 columns]


In [17]:
# Rank df2 from its three mean-threshold labels
df2 = df2.pipe(assign_rank)

# Leading rows, now including the 'rank' column
print(df2.head())

  Stock_Name Earnings_Call_Date  Earnings_Surprise     Firm_Size  CAR_3_Days  \
0        STZ         2020-01-08           0.169399  3.650714e+10   -0.014271   
1        RPM         2020-01-08           0.041096  9.467565e+09   -0.008348   
2        PTC         2020-01-22           0.266667  9.242700e+09    0.094707   
3         PG         2020-01-23           0.036496  3.170399e+11    0.014272   
4        RJF         2020-01-23           0.000000  1.262831e+10    0.017126   

   CAR_10_Days  CAR_30_Days  method_1  method_2  method_3  label_1  label_2  \
0     0.010221     0.083406  0.432441  0.768821  0.486680        0        1   
1    -0.026677    -0.059266  0.194338  0.378917  0.132254        0        0   
2     0.089449     0.049796  0.318199  0.427348  0.141501        1        1   
3     0.005064     0.015538  0.397074  0.502051  0.364789        1        1   
4     0.026210    -0.054972  0.251476  0.127434  0.137728        1        1   

   label_3  rank  
0        1     2  
1     

In [19]:
# Write df2 to 'df2.csv'; index=False keeps the row index out of the file
df2.to_csv(path_or_buf="df2.csv", index=False)

print("Processed data saved as 'df2.csv'.")

Processed data saved as 'df2.csv'.


In [20]:
# Median-threshold labelling on df3: a label is 1 when the CAR for that window
# is strictly greater than the column median, otherwise 0.
car_medians = df3[['CAR_3_Days', 'CAR_10_Days', 'CAR_30_Days']].median()

# Create label_1: 1 if CAR_3_Days > median, else 0
df3['label_1'] = (df3['CAR_3_Days'] > car_medians['CAR_3_Days']).astype(int)

# Create label_2: 1 if CAR_10_Days > median, else 0
df3['label_2'] = (df3['CAR_10_Days'] > car_medians['CAR_10_Days']).astype(int)

# Create label_3: 1 if CAR_30_Days > median, else 0
df3['label_3'] = (df3['CAR_30_Days'] > car_medians['CAR_30_Days']).astype(int)

# Rank df3 from its labels, exactly as df1 and df2 are ranked before they are
# saved. Without this step df3 would be written to disk without the 'rank'
# column that the other two datasets carry.
df3 = assign_rank(df3)

# Verify the new labels (the column selection below is unaffected by 'rank')
print("Median Values Used for Labeling:")
print(car_medians)
print(df3[['CAR_3_Days', 'label_1', 'CAR_10_Days', 'label_2', 'CAR_30_Days', 'label_3']].head(100))


Median Values Used for Labeling:
CAR_3_Days     0.001031
CAR_10_Days    0.001170
CAR_30_Days   -0.001128
dtype: float64
    CAR_3_Days  label_1  CAR_10_Days  label_2  CAR_30_Days  label_3
0    -0.014271        0     0.010221        1     0.083406        1
1    -0.008348        0    -0.026677        0    -0.059266        0
2     0.094707        1     0.089449        1     0.049796        1
3     0.014272        1     0.005064        1     0.015538        1
4     0.017126        1     0.026210        1    -0.054972        0
..         ...      ...          ...      ...          ...      ...
95    0.028566        1     0.003689        1     0.017732        1
96    0.029075        1     0.034162        1    -0.007515        0
97    0.029962        1     0.096126        1     0.186042        1
98    0.157220        1     0.148475        1     0.218370        1
99    0.000053        0    -0.022676        0     0.017140        1

[100 rows x 6 columns]


In [21]:
# Write df3 to 'df3.csv'; index=False keeps the row index out of the file
df3.to_csv(path_or_buf="df3.csv", index=False)

print("Processed data saved as 'df3.csv'.")

Processed data saved as 'df3.csv'.
