In [None]:
import os
import pandas as pd

In [None]:
class Dataset:
    """Base class that remembers a data folder and holds the merged X/y table.

    Attributes:
        path: Folder path given to the constructor.
        dataset: Result of :meth:`merge_x_and_y`; ``None`` until that method
            has been called.
    """

    # Class-level default so the attribute exists before any merge happens.
    dataset = None

    def __init__(self, folderpath):
        """Store *folderpath* as ``self.path``."""
        self.path = folderpath

    def merge_x_and_y(self, x_file, y_file):
        """Join the feature CSV and the target CSV on ``Year`` and ``Tiker``.

        Args:
            x_file: Path of the feature CSV; must contain ``Year`` and
                ``Tiker`` columns.
            y_file: Path of the target CSV; its first column is discarded
                before merging, and the remainder must contain ``Year`` and
                ``Tiker`` columns.

        Returns:
            ``self``, with the inner-joined frame stored in ``self.dataset``.
        """
        features = pd.read_csv(x_file)
        targets = pd.read_csv(y_file)
        # The first column of the y file is dropped by position.
        # NOTE(review): presumably the index column written by a previous
        # ``to_csv`` call -- confirm against the file that is passed in.
        targets = targets.iloc[:, 1:]
        # Store on the instance: a plain local assignment here would leave
        # ``self.dataset`` as None and make exportDataset() fail.
        self.dataset = pd.merge(features, targets, on=['Year', 'Tiker'])
        return self

    def exportDataset(self):
        """Write ``self.dataset`` to ``dataset.csv`` in the working directory.

        Raises:
            RuntimeError: If :meth:`merge_x_and_y` has not been called yet.
        """
        if self.dataset is None:
            raise RuntimeError(
                "No dataset to export; call merge_x_and_y() first."
            )
        self.dataset.to_csv('dataset.csv')

class X_data(Dataset):
    """Build a feature table from per-company financial-statement workbooks.

    The methods depend on each other's output in this order:
    ``renameFiles`` -> ``getFiles`` -> ``curate_x_data`` ->
    ``transpose_x_data`` -> ``merge_triplets`` -> ``drop_outliers`` ->
    ``concat_x_data`` -> ``export_x_dataset``.  Most of them return ``self``
    so they can be chained.

    All intermediate files are read from and written to ``self.path``; only
    ``export_x_dataset`` writes to the current working directory.

    Attributes:
        file_list: Sorted ``.xlsx`` file names found by :meth:`getFiles`.
        files: ``file_list`` split into groups of three names.
        df_list: Unused placeholder kept for backward compatibility.
        merged_list: One merged frame per group, filled by
            :meth:`merge_triplets`.
        tiker_list: Ticker of each frame in ``merged_list`` (same order).
        final_df: Row-wise concatenation built by :meth:`concat_x_data`.
    """

    final_df = None

    def __init__(self, folderpath):
        """Store the folder and create fresh, per-instance working lists."""
        super().__init__(folderpath)
        # These are instance attributes on purpose: as class attributes the
        # lists would be shared (and keep growing) across all instances, and
        # a class-level listing of ``self.path`` cannot work because ``self``
        # does not exist while the class body executes.
        self.file_list = []
        self.files = []
        self.df_list = []
        self.merged_list = []
        self.tiker_list = []
        self.final_df = None

    def _in_folder(self, name):
        """Return *name* joined onto ``self.path``."""
        return os.path.join(self.path, name)

    @staticmethod
    def _last_sheet(workbook_path, header):
        """Read a workbook and return ``(sheet_name, frame)`` of its last sheet."""
        sheets = pd.read_excel(workbook_path, sheet_name=None, header=header)
        sheet_name = list(sheets)[-1]
        return sheet_name, sheets[sheet_name]

    def renameFiles(self):
        """Re-save every ``.xls`` file as ``<A1 cell value>.xlsx`` and delete it.

        Only the last sheet of each workbook is carried over, and the new
        file name is taken from that sheet's top-left cell.

        Returns:
            ``self``.
        """
        for name in os.listdir(self.path):
            if not name.endswith(".xls"):
                continue
            source = self._in_folder(name)
            sheet_name, sheet_data = self._last_sheet(source, header=None)
            a1_value = sheet_data.iloc[0, 0]  # top-left (A1) cell
            # f-string so a non-string A1 value cannot raise a TypeError.
            target = self._in_folder(f"{a1_value}.xlsx")
            with pd.ExcelWriter(target, engine='xlsxwriter') as writer:
                sheet_data.to_excel(writer, sheet_name=sheet_name, index=False)
            os.remove(source)
        return self

    def getFiles(self):
        """List the ``.xlsx`` files in the folder and group them in threes.

        The listing is sorted so grouping is deterministic and matches the
        sorted grouping done in :meth:`merge_triplets`.
        NOTE(review): assumes the three statements of one company sort next
        to each other as balance sheet, cash flow, income statement --
        confirm against the actual file names.

        Returns:
            ``self``, with ``file_list`` and ``files`` rebuilt from scratch.
        """
        self.file_list = sorted(
            name for name in os.listdir(self.path) if name.endswith(".xlsx")
        )
        self.files = [
            self.file_list[i:i + 3] for i in range(0, len(self.file_list), 3)
        ]
        return self

    def removeJunk(self, df_list):
        """Drop the ``TTM`` column, in place, from every frame that has one."""
        for df in df_list:
            if 'TTM' in df.columns:
                df.drop('TTM', axis=1, inplace=True)
        return

    def extractCommonColumns(self, df_list):
        """Reduce three frames to the columns they all share and fill NaNs.

        The first column of each frame is always kept.  The frames are
        modified in place.

        Args:
            df_list: Exactly three frames (balance sheet, cash flow, income
                statement).

        Returns:
            The same three frames as a tuple, in the order given.
        """
        balance_sheet_df, cash_flow_df, income_statement_df = df_list

        common_columns = (
            set(balance_sheet_df.columns[1:])
            & set(cash_flow_df.columns[1:])
            & set(income_statement_df.columns[1:])
        )

        for df in (balance_sheet_df, cash_flow_df, income_statement_df):
            # Decide per frame, so a frame's own first column can never be
            # dropped because of a same-named column in another frame.
            extra = [col for col in df.columns[1:] if col not in common_columns]
            df.drop(columns=extra, inplace=True)
            df.fillna(0, inplace=True)

        return balance_sheet_df, cash_flow_df, income_statement_df

    def curate_x_data(self):
        """Clean each group of three workbooks and save the result as CSV.

        For every group in ``self.files`` the last sheet of each workbook is
        read (second spreadsheet row as header), ``TTM`` is removed, only
        shared columns are kept, and each frame is written next to its
        workbook with the extension replaced by ``.csv``.

        Returns:
            ``self``.

        Raises:
            ValueError: If a group does not contain exactly three files.
        """
        for triplet in self.files:
            if len(triplet) != 3:
                raise ValueError(
                    f"Expected 3 statement files per company, got {triplet!r}"
                )
            frames = [
                self._last_sheet(self._in_folder(name), header=1)[1]
                for name in triplet
            ]
            self.removeJunk(frames)
            curated = self.extractCommonColumns(frames)
            for name, frame in zip(triplet, curated):
                # splitext removes only the extension; str.rstrip(".xlsx")
                # strips a *set of characters* and eats trailing letters of
                # the stem (e.g. "sheets" -> "sheet").
                stem = os.path.splitext(name)[0]
                frame.to_csv(self._in_folder(stem + ".csv"), index=False, header=True)
        return self

    def transpose_x_data(self):
        """Drop the last row of every CSV in the folder and transpose it.

        After transposing, the former first column supplies the column
        names and the former header becomes the (written) index.

        Returns:
            ``self``.
        """
        for name in os.listdir(self.path):
            if not name.endswith(".csv"):
                continue
            csv_path = self._in_folder(name)
            # Write the trimmed frame and read it back before transposing.
            # NOTE(review): the round trip is kept from the original so
            # pandas re-parses the data without the dropped row; confirm
            # whether it is still needed.
            pd.read_csv(csv_path).iloc[:-1].to_csv(csv_path, index=False, header=True)
            transposed_df = pd.read_csv(csv_path).T
            # get 1st row as column names
            transposed_df.columns = transposed_df.iloc[0]
            # drop 1st row (column names)
            transposed_df = transposed_df[1:]
            transposed_df.to_csv(csv_path)
        return self

    def merge_triplets(self):
        """Merge each run of three sorted CSVs side by side into one frame.

        Column names are stripped, duplicate columns are dropped (first
        occurrence wins) and a ``Tiker`` column is added, taken from the
        part of the first file name before the first underscore.

        Returns:
            ``self``, with ``merged_list`` and ``tiker_list`` rebuilt.
        """
        file_list = sorted(
            name for name in os.listdir(self.path) if name.endswith(".csv")
        )
        df_list = [pd.read_csv(self._in_folder(name)) for name in file_list]
        # Start clean so calling this twice does not duplicate every company.
        self.merged_list = []
        self.tiker_list = []
        for i in range(0, len(df_list), 3):
            merged_group = pd.concat(df_list[i:i + 3], axis=1)
            merged_group.columns = merged_group.columns.str.strip()
            merged_group = merged_group.loc[:, ~merged_group.columns.duplicated()]
            tiker = file_list[i].split('_')[0]
            self.tiker_list.append(tiker)
            merged_group['Tiker'] = tiker
            self.merged_list.append(merged_group)
        return self

    def drop_outliers(self):
        """Remove every merged frame that has no ``Gross Profit`` column.

        ``merged_list`` and ``tiker_list`` are filtered together so they
        stay aligned.

        Returns:
            ``self``.
        """
        kept = [
            (df, tiker)
            for df, tiker in zip(self.merged_list, self.tiker_list)
            if "Gross Profit" in df.columns
        ]
        self.merged_list = [df for df, _ in kept]
        self.tiker_list = [tiker for _, tiker in kept]
        return self

    def concat_x_data(self):
        """Stack all merged frames into ``final_df`` on their shared columns.

        The ``Unnamed: 0`` column is renamed to ``Year`` and cast to int.

        Returns:
            ``self``.
        """
        self.final_df = pd.concat(self.merged_list, join='inner', ignore_index=True)
        self.final_df = self.final_df.rename(columns={'Unnamed: 0': 'Year'})
        self.final_df['Year'] = self.final_df['Year'].astype(int)
        return self

    def export_x_dataset(self):
        """Write ``final_df`` to ``X_dataset.csv`` in the working directory.

        Raises:
            RuntimeError: If :meth:`concat_x_data` has not been called yet.
        """
        if self.final_df is None:
            raise RuntimeError(
                "No feature table to export; call concat_x_data() first."
            )
        self.final_df.to_csv("X_dataset.csv")

    def cleanup(self):
        """Delete every ``.xlsx`` file in the folder.

        Returns:
            ``self``.
        """
        for name in os.listdir(self.path):
            if name.endswith(".xlsx"):
                os.remove(self._in_folder(name))
        return self

class Y_Data(Dataset):
    """Build the target table (yearly returns) from per-ticker price CSVs.

    Attributes:
        merged_list: Per-ticker frames collected by :meth:`megre_y_data`.
        merged_df: Row-wise concatenation of ``merged_list``.
    """

    merged_df = None

    def __init__(self, folderpath):
        """Store the folder and create a fresh, per-instance frame list."""
        super().__init__(folderpath)
        # Instance attribute: a class-level list would be shared by, and keep
        # growing across, every Y_Data instance.
        self.merged_list = []
        self.merged_df = None

    def curate_y_data(self):
        """Replace every price CSV in the folder with one row per year.

        Steps:
        1. Read the expected names from ``tickers.txt`` (looked up in the
           current working directory) and print each one that has no
           matching ``<name>.csv`` in the folder.
        2. For every CSV in the folder keep rows dated 2014-01-01 or later,
           take the first row of each calendar year, and compute ``return``
           as the relative change of ``Open`` from that year to the next
           (so the last year has no value).
        3. Overwrite the CSV with columns ``Open``, ``Date``, ``Year`` and
           ``return``.

        NOTE(review): "first row of each year" follows the row order of the
        input file; the data is not sorted by date here.

        Returns:
            ``self``.
        """
        with open('tickers.txt', 'r') as ticker_file:
            expected = [line.strip() for line in ticker_file]

        # splitext removes only the extension; str.rstrip(".csv") strips a
        # *set of characters* and would mangle names ending in c, s or v.
        files = [
            os.path.splitext(name)[0]
            for name in os.listdir(self.path)
            if name.endswith(".csv")
        ]
        present = set(files)
        for ticker in expected:
            if ticker not in present:
                print(ticker)

        for stem in files:
            csv_path = os.path.join(self.path, stem + ".csv")
            df = pd.read_csv(csv_path)
            df['Date'] = pd.to_datetime(df['Date'])
            recent = df.loc[df['Date'] >= '2014-01-01', ['Open', 'Date']]
            yearly = recent.groupby(recent['Date'].dt.year).first().reset_index(drop=True)
            yearly['Year'] = yearly['Date'].dt.year
            previous_open = yearly['Open'].shift(1)
            # Change versus the previous year, then shifted up one row so each
            # year carries the return realised by the following year.
            yearly['return'] = ((yearly['Open'] - previous_open) / previous_open).shift(-1)
            yearly.to_csv(csv_path, index=False)
        return self

    def megre_y_data(self):
        """Stack every CSV in the folder into ``merged_df`` with a ``Tiker`` column.

        ``Tiker`` is the file name without its ``.csv`` extension, the same
        stem that :meth:`curate_y_data` compares against ``tickers.txt``.
        Files are processed in sorted order so the row order is reproducible.

        Returns:
            ``self``, with ``merged_list`` rebuilt from scratch.
        """
        self.merged_list = []
        for name in sorted(os.listdir(self.path)):
            if name.endswith(".csv"):
                df = pd.read_csv(os.path.join(self.path, name))
                df['Tiker'] = os.path.splitext(name)[0]
                self.merged_list.append(df)
        self.merged_df = pd.concat(self.merged_list, axis=0, ignore_index=True)
        return self

    # Correctly spelled alias; the original name stays for existing callers.
    merge_y_data = megre_y_data

    def export_y_dataset(self):
        """Write ``merged_df`` to ``y_merged.csv`` in the working directory.

        Raises:
            RuntimeError: If :meth:`megre_y_data` has not been called yet.
        """
        if self.merged_df is None:
            raise RuntimeError(
                "No target table to export; call megre_y_data() first."
            )
        self.merged_df.to_csv("y_merged.csv")
    
    