# 6. Classify companies and conduct statistical tests 
-------------------
Group 3 , September 24, 2022
1. Gezhi Cheng, 
2. Haowei Lee, 
3. Ziyi Liu, 
4. VS Chaitanya Madduri

> <i>Description: The program in this notebook
- classifies records into different groups based on two dimensions
    1. frequent restructuring
    2. positive change in profitability
- repeats the classification for three consecutive years (1-, 2- and 3-year lags)
- conducts statistical tests on the resulting groups
</i>


<div class="alert alert-block alert-info">
    <b>Tip:</b> Please run this notebook in Google Colab.
</div> 

### Prerequisites:
1. Add a shortcut to the shared Drive folder https://drive.google.com/drive/folders/1X4UdGsQiHVWSr63FRiz8rwOuWW5Ua8uI?usp=sharing to your personal Google Drive.


Because the files are large, we store them in our personal Google Drive folders.

Files:
- restructuring_data.csv - Data of company information and classification on frequent restructuring


### Output files:

Files:
None

## Load dependencies

In [1]:
# Mount Google Drive at /content/drive so the notebook can read the data files
# stored there. `google.colab` is only importable inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

# import python packages (scipy.stats provides the t-tests used further down,
# pandas the data frames)
from scipy import stats
import pandas as pd
import numpy as np

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Paths: DIR_PATH is a folder on the mounted Google Drive (not the notebook's
# working directory); the CSV file name is appended to it when the data is read.
DIR_PATH = "/content/drive/MyDrive/SPM_files/"  # data folder on the mounted drive
RESTRUCTURING_DATA_PATH = "restructuring_data.csv"
# NOTE(review): LIMIT is not referenced in any visible cell of this notebook —
# confirm whether it is still needed.
LIMIT =3

# Columns kept from the CSV, in the order they should appear in the data frame.
# NOTE(review): 'compamy_name' looks like a misspelling of "company_name", but
# it is used as a column key and must match the CSV header — verify against
# the file before changing it.
COLUMNS_ORDER = ["Company_Key", "Year", 'compamy_name', "profitability", 'profitability_next_1',
       'profitability_next_2', 'profitability_next_3', 'change_next_1',
       'change_next_2', 'change_next_3', 'is_restructuring']

# Names of the columns used for classification:
# - IS_RESTRUCTURING is read from the CSV
# - POSITIVE_CHANGE_* and CLASS are created by this notebook
IS_RESTRUCTURING = "is_restructuring"
POSITIVE_CHANGE_1 = "positive_change_1"
POSITIVE_CHANGE_2 = "positive_change_2"
POSITIVE_CHANGE_3 = "positive_change_3"
CLASS = "class"

# Letter prefixes of the class labels for the 1-, 2- and 3-year gap
# (e.g. "A1".."A4" for the 1-year gap)
PREFIX_ONE = "A"
PREFIX_TWO = "B" 
PREFIX_THREE = "C" 

In [3]:
# Load the profitability / restructuring data, order the records by company
# and year, and keep only the columns listed in COLUMNS_ORDER (in that order).
df = (
    pd.read_csv(DIR_PATH + RESTRUCTURING_DATA_PATH)
    .sort_values(by=['Company_Key', 'Year'])[COLUMNS_ORDER]
)
print(df.shape)

(486, 11)


In [4]:
# Add one flag column per year gap: 1 when the corresponding change value is
# strictly positive, 0 otherwise (zero, negative or missing values all map to 0).
# The comparison is vectorised, so no separate zero-initialisation and no
# per-row lambda are needed.
for change_col, flag_col in (
    ("change_next_1", POSITIVE_CHANGE_1),
    ("change_next_2", POSITIVE_CHANGE_2),
    ("change_next_3", POSITIVE_CHANGE_3),
):
    df[flag_col] = (df[change_col] > 0).astype("int64")

In [5]:
# make three independent copies of the data, one per year gap, so that each
# copy can receive its own class labels
df_y1 = df.copy()
df_y2 = df.copy()
df_y3 = df.copy()

# pair every copy with the positive-change flag column used to classify it
dfs = [(df_y1, POSITIVE_CHANGE_1),
       (df_y2, POSITIVE_CHANGE_2),
       (df_y3, POSITIVE_CHANGE_3)]


# add the class column to every copy, with zero as the placeholder value.
# The loop variable is deliberately not named `df`, so the original data
# frame is not rebound to something else as a side effect of this cell.
for yearly_df in (df_y1, df_y2, df_y3):
    yearly_df[CLASS] = 0

In [6]:
def get_class(df, year_gap, restructure_col, profit_col):
    """Label each record with one of four classes for the given year gap.

    A label is a prefix letter followed by a digit. The prefix is
    ``PREFIX_ONE``, ``PREFIX_TWO`` or ``PREFIX_THREE`` for a ``year_gap`` of
    1, 2 or 3. The digit encodes the combination of the two flag columns:

    =====  ===============  ==========
    digit  restructure_col  profit_col
    =====  ===============  ==========
    1      1                0
    2      1                1
    3      0                0
    4      0                1
    =====  ===============  ==========

    Rows matching none of the four combinations keep whatever value they
    already have in the ``CLASS`` column.

    Args:
        df: data frame to label; it must already contain the ``CLASS`` column
            and is modified in place.
        year_gap: 1, 2 or 3; selects the label prefix.
        restructure_col: name of the 0/1 restructuring flag column.
        profit_col: name of the 0/1 positive-change flag column.

    Returns:
        The same ``df`` object, so the function can be used with ``df.pipe``.

    Raises:
        ValueError: if ``year_gap`` is not 1, 2 or 3.
    """
    prefixes = {1: PREFIX_ONE, 2: PREFIX_TWO, 3: PREFIX_THREE}
    try:
        pre = prefixes[year_gap]
    except KeyError:
        raise ValueError(
            f"year_gap must be 1, 2 or 3, got {year_gap!r}"
        ) from None

    restructured = df[restructure_col] == 1
    not_restructured = df[restructure_col] == 0
    improved = df[profit_col] == 1
    not_improved = df[profit_col] == 0

    # masks in label order: index 0 -> "<pre>1", ..., index 3 -> "<pre>4"
    masks = (
        restructured & not_improved,
        restructured & improved,
        not_restructured & not_improved,
        not_restructured & improved,
    )

    # The placeholder values may be integers; switch the column to object
    # dtype first so that writing string labels is not an implicit upcast.
    df[CLASS] = df[CLASS].astype(object)

    # A single .loc call writes into df itself. The chained form
    # df[CLASS][mask] = ... may write to a temporary copy instead.
    for digit, mask in enumerate(masks, start=1):
        df.loc[mask, CLASS] = f"{pre}{digit}"

    return df

In [7]:
import warnings
# silence every warning from here on for the rest of the session
warnings.filterwarnings("ignore")

# classify the records of each yearly data frame into four groups and print
# how many records ended up in every group; the year gap is the 1-based
# position of the data frame in `dfs`
for i, (df, change) in enumerate(dfs):
    df = get_class(df, i + 1, IS_RESTRUCTURING, change)
    print(df[CLASS].value_counts())

A3    206
A4    133
A1     82
A2     65
Name: class, dtype: int64
B3    178
B4    161
B1     74
B2     73
Name: class, dtype: int64
C3    196
C4    143
C2     77
C1     70
Name: class, dtype: int64


## End of classification

## Start statistical test

In [8]:
def round_number(x, decimals=3):
    """Format a number as a fixed-point string rounded to ``decimals`` places.

    The value is rounded numerically rather than by cutting characters off
    its string form, so very small values (whose ``str`` uses scientific
    notation) and values with several integer digits are formatted correctly.
    The result is always a string, including for zero and NaN.

    Args:
        x: the number to format.
        decimals: number of digits after the decimal point (default 3).

    Returns:
        str: ``x`` in fixed-point notation, e.g. ``"-1.658"`` or ``"0.000"``.
    """
    return f"{x:.{decimals}f}"

In [9]:
class Test:
    """Run an independent and a paired t-test on class-labelled records.

    The distinct values of ``column`` are sorted and addressed by position:
    the first two sorted labels form one group and the next two the other.
    With the labels produced in this notebook (``X1``..``X4``) the first
    group holds the records flagged as restructuring (classes 1 and 2) and
    the second group the remaining records (classes 3 and 4).

    Attributes are plain and public:
        data: the data frame passed to the constructor.
        column: name of the column holding the class labels.
        classes: sorted array of the distinct labels in ``column``.
        independent_ttest_result: ``(statistic, pvalue)`` as formatted by
            ``round_number``, or ``None`` until ``independent_ttest`` is run.
        paired_ttest_result: same, for ``paired_ttest``.
    """

    def __init__(self, data, column):
        self.data = data
        self.column = column

        # np.asarray first, so the labels can be sorted even when pandas
        # returns an extension array (which has no in-place .sort()).
        self.classes = np.sort(np.asarray(self.data[column].unique()))

        self.independent_ttest_result = None
        self.paired_ttest_result = None

    def _class_mask(self, start, stop):
        """Return a boolean mask of the rows labelled ``classes[start:stop]``."""
        if len(self.classes) < stop:
            raise ValueError(
                f"column {self.column!r} has {len(self.classes)} distinct "
                f"classes, but at least {stop} are required"
            )
        return self.data[self.column].isin(self.classes[start:stop])

    def independent_ttest(self, column_to_compare):
        """Compare ``column_to_compare`` between classes 1+2 and classes 3+4.

        Uses ``scipy.stats.ttest_ind`` with its default settings and stores
        the formatted ``(statistic, pvalue)`` in ``independent_ttest_result``.

        Raises:
            ValueError: if ``column`` holds fewer than four distinct classes.
        """
        # build both masks first, so a missing class is reported before any
        # data is selected
        first_group = self._class_mask(0, 2)
        second_group = self._class_mask(2, 4)

        results = stats.ttest_ind(
            self.data.loc[first_group, column_to_compare],
            self.data.loc[second_group, column_to_compare],
        )

        self.independent_ttest_result = (
            round_number(results[0]),
            round_number(results[1]),
        )

    def paired_ttest(self, col1, col2):
        """Compare ``col1`` with ``col2`` row by row within classes 1+2.

        Uses ``scipy.stats.ttest_rel`` with its default settings and stores
        the formatted ``(statistic, pvalue)`` in ``paired_ttest_result``.

        Raises:
            ValueError: if ``column`` holds fewer than two distinct classes.
        """
        first_group = self._class_mask(0, 2)

        results = stats.ttest_rel(
            self.data.loc[first_group, col1],
            self.data.loc[first_group, col2],
        )

        self.paired_ttest_result = (
            round_number(results[0]),
            round_number(results[1]),
        )

    def get_test_result(self):
        """Print the stored results of both tests.

        Raises:
            RuntimeError: if ``independent_ttest`` or ``paired_ttest`` has
                not been run yet.
        """
        if self.independent_ttest_result is None or self.paired_ttest_result is None:
            raise RuntimeError(
                "run independent_ttest() and paired_ttest() before "
                "get_test_result()"
            )

        # compare among four classes
        # between (independent t-test)
        # class 1+2 vs. class 3+4 (does the compared column differ
        # significantly between the two groups?)
        print("between(independent t-test)")
        print(f"statistic={self.independent_ttest_result[0]}, pvalue={self.independent_ttest_result[1]}")

        print("")

        # within (paired t-test)
        # the two compared columns (in this notebook: before vs. after the
        # restructuring) for the same records of class 1 + class 2
        print("within(Paired t-test)")
        print(f"statistic={self.paired_ttest_result[0]}, pvalue={self.paired_ttest_result[1]}")

## 1-year lag

In [10]:
# 1-year lag: run both tests on the records labelled A1-A4 (df_y1)
test1 = Test(df_y1, "class")
# independent t-test on `change_next_1`: classes A1+A2 vs. classes A3+A4
test1.independent_ttest("change_next_1")
# paired t-test within classes A1+A2: `profitability` vs. `profitability_next_1`
test1.paired_ttest("profitability", "profitability_next_1")
# print the statistic and p-value of both tests
test1.get_test_result()

between(independent t-test)
statistic=-1.658, pvalue=0.097

within(Paired t-test)
statistic=-2.331, pvalue=0.021


## 2-year lag

In [11]:
# 2-year lag: run both tests on the records labelled B1-B4 (df_y2)
test2 = Test(df_y2, "class")
# independent t-test on `change_next_2`: classes B1+B2 vs. classes B3+B4
test2.independent_ttest("change_next_2")
# paired t-test within classes B1+B2: `profitability` vs. `profitability_next_2`
test2.paired_ttest("profitability", "profitability_next_2")
# print the statistic and p-value of both tests
test2.get_test_result()

between(independent t-test)
statistic=-1.661, pvalue=0.097

within(Paired t-test)
statistic=-3.522, pvalue=0.000


## 3-year lag

In [12]:
# 3-year lag: run both tests on the records labelled C1-C4 (df_y3)
test3 = Test(df_y3, "class")
# independent t-test on `change_next_3`: classes C1+C2 vs. classes C3+C4
test3.independent_ttest("change_next_3")
# paired t-test within classes C1+C2: `profitability` vs. `profitability_next_3`
test3.paired_ttest("profitability", "profitability_next_3")
# print the statistic and p-value of both tests
test3.get_test_result()

between(independent t-test)
statistic=0.286, pvalue=0.774

within(Paired t-test)
statistic=-3.379, pvalue=0.000
