In [33]:
import pandas as pd
import datetime
from collections import Counter
import os
import shutil

In [35]:
final_financial_data_directory = 'final_financial_data'
final_financial_data_in_date_range_directory = 'final_financial_data_in_date_range'

In [17]:
eligible_companies = pd.read_csv('companies_eligible_to_financial_dataset_2024-04-19_14-47-55.csv')
eligible_companies.head()

Unnamed: 0.1,Unnamed: 0,cik,company,label,ticker,gurufocus-company-name,gurufocus-stockid,filing_date,cik-equal-gurufocus-cik,first_match
0,0,1800,ABBOTT LABORATORIES,False,ABT,Abbott Laboratories,US066X,2021-02-19,True,
1,1,2488,ADVANCED MICRO DEVICES INC,False,AMD,Advanced Micro Devices Inc,US022E,2021-01-29,True,
2,2,2491,"BALLY TECHNOLOGIES, INC.",False,BYI,Bally Technologies Inc (Delisted),US06R5,2014-08-29,True,
3,3,2601,AEROFLEX INC,False,ARXX,Aeroflex Inc (Delisted),US026H,2006-09-13,True,
4,4,2969,AIR PRODUCTS & CHEMICALS INC /DE/,False,APD,Air Products & Chemicals Inc,US06DU,2021-11-18,True,


In [18]:
eligible_companies['label'].value_counts()

label
False    5291
True      126
Name: count, dtype: int64

In [19]:
final_eligible_companies = eligible_companies.copy()

In [20]:
def set_date_difference(final_eligible_companies):

    for index, row in final_eligible_companies.iterrows():
        cik = row['cik']
        gurufocus_stockid = row['gurufocus-stockid']
        ticker = row['ticker']
        filing_date = row['filing_date']
            
        filepath = f'{final_financial_data_directory}/{cik}-{gurufocus_stockid}_{ticker}.csv'
            
        df = pd.read_csv(filepath)
        last_column_name = df.columns[-1]
        split_date = last_column_name.split(' ')
        month, year = split_date[0], split_date[1]
        year = int(year.split('.')[0])
        
        if 0 <= year <= 9:
            year = f'200{year}'
        elif 10 <= year <= 24:
            year = f'20{year}'
        elif year > 24:
            year = f'19{year}'
            
        gurufocus_last_data_date = datetime.datetime.strptime(f'{month} {year}', '%b %Y')
        
        filing_date = filing_date[:7]
        filing_date = datetime.datetime.strptime(f'{filing_date}', '%Y-%m')

        difference_in_months = int((gurufocus_last_data_date - filing_date).days / 30)
        
        final_eligible_companies.loc[final_eligible_companies['cik'] == cik, 'difference_in_months'] = difference_in_months
        
    return final_eligible_companies

In [21]:
final_eligible_companies = set_date_difference(final_eligible_companies)

In [22]:
counts_df = final_eligible_companies['difference_in_months'].value_counts()
# set column names
counts_df = counts_df.reset_index()
counts_df.columns = ['difference_in_months', 'count']
counts_df.head()

Unnamed: 0,difference_in_months,count
0,34.0,1172
1,33.0,650
2,6.0,519
3,0.0,514
4,3.0,337


In [36]:
def get_companies_within_date_range(final_eligible_companies):
    mask = (final_eligible_companies['difference_in_months'] > -12) & (final_eligible_companies['difference_in_months'] < 36)
    eligible_companies_in_eligible_range = final_eligible_companies[mask]
    eligible_companies_in_eligible_range['difference_in_months'] = eligible_companies_in_eligible_range['difference_in_months'].astype(int)
    return eligible_companies_in_eligible_range

def drop_unused_columns(final_eligible_companies):
    final_eligible_companies.drop(columns=['cik-equal-gurufocus-cik', 'first_match'], inplace=True)
    return final_eligible_companies

#### Processing I

In [30]:
final_eligible_companies = get_companies_within_date_range(final_eligible_companies)
final_eligible_companies = drop_unused_columns(final_eligible_companies)

print(len(final_eligible_companies))
final_eligible_companies['label'].value_counts(normalize=True)

5055


label
False    0.976657
True     0.023343
Name: proportion, dtype: float64

In [None]:
final_eligible_companies.to_csv('eligible_companies_in_eligible_range.csv', index=False)

In [34]:
for index, row in final_eligible_companies.iterrows():
    cik = row['cik']
    gurufocus_stockid = row['gurufocus-stockid']
    ticker = row['ticker']
    filing_date = row['filing_date']
    
    filepath = f'{final_financial_data_directory}/{cik}-{gurufocus_stockid}_{ticker}.csv'
        
    shutil.copy(filepath, f'./{final_financial_data_in_date_range_directory}/{cik}-{gurufocus_stockid}_{ticker}.csv')

#### Processing II

In [38]:
with open('chosen_variables.txt', 'r') as file:
    chosen_variables = file.read().splitlines()
    
with open('most_common_variables.txt', 'r') as file:
    most_common_variables = file.read().splitlines()

In [None]:
def delete_duplicated_rows(df):
    df.drop(df.index[df.iloc[:, 1].duplicated(keep=False)], inplace=True)
    return df

def has_dataframe_enough_variables(df, variables, threshold):
    csv_variable = df.iloc[:, 1].tolist()
    cnt = 0
    for chosen_variable in variables:
        if chosen_variable not in csv_variable:
            cnt += 1
            
    if cnt <= threshold:
        return True
    else:
        return False

In [None]:
for filename in os.listdir(final_financial_data_in_date_range_directory):
    df = pd.read_csv(f'{final_financial_data_in_date_range_directory}/{filename}')
    
    df = delete_duplicated_rows(df)
    
    # invert df
    
    
    