# Clustering
This notebook demonstrates how to gather company data from the stock market (via `yfinance`) and build a clustering algorithm on top of it.

In [38]:
# Import libraries
import yfinance as yf
import pandas as pd

from sklearn.preprocessing import StandardScaler


In [39]:
class Clustering:
    '''

    Gather per-company figures for a list of stocks and turn them into a
    standardized feature table.

    attributes:
    - stock_list: iterable of ticker symbols handed to ``yf.Ticker``.
    - FEATURE_COLUMNS: keys read from each ticker's ``info`` mapping; they
      become the columns of the feature table, in this order.

    methods:
    - gather_data

    '''

    # Keys of `yf.Ticker(symbol).info` that are kept as features.
    FEATURE_COLUMNS = (
        'fullTimeEmployees',
        'trailingAnnualDividendYield',
        'payoutRatio',
        'averageDailyVolume10Day',
        'trailingAnnualDividendRate',
        'averageVolume10days',
        'dividendRate',
        'beta',
        'priceHint',
        'trailingPE',
        'regularMarketVolume',
        'marketCap',
        'averageVolume',
        'priceToSalesTrailing12Months',
        'forwardPE',
        'fiveYearAvgDividendYield',
        'dividendYield',
        'enterpriseToRevenue',
        'profitMargins',
        'enterpriseToEbitda',
        'forwardEps',
        'bookValue',
        'sharesPercentSharesOut',
        'heldPercentInstitutions',
        'netIncomeToCommon',
        'trailingEps',
        'lastDividendValue',
        'priceToBook',
        'heldPercentInsiders',
        'shortRatio',
        'enterpriseValue',
        'earningsQuarterlyGrowth',
        'pegRatio',
        'shortPercentOfFloat',
    )

    def __init__(self, stock_list):
        self.stock_list = stock_list

    def gather_data(self):
        '''

        Gathering and formatting stock data into a dataframe.

        For every symbol in ``self.stock_list`` the ticker's ``info`` mapping
        is read and reduced to ``FEATURE_COLUMNS``. Keys a ticker does not
        report, and ``None``/NaN values, are filled with 0. Each column is
        then standardized with ``StandardScaler``.

        return: Pandas DataFrame of floats with one row per symbol (in the
        order of ``stock_list``) and one column per entry of
        ``FEATURE_COLUMNS``. An empty ``stock_list`` yields an empty frame
        with those columns. The result can be persisted by the caller, e.g.
        with ``DataFrame.to_csv``.

        raises: ValueError if a reported value cannot be converted to float.

        '''
        columns = list(self.FEATURE_COLUMNS)

        records = []
        for symbol in self.stock_list:
            company_info = yf.Ticker(symbol).info
            # .get() yields None for keys this ticker does not report; selecting
            # missing labels with .loc raises KeyError in current pandas.
            records.append({column: company_info.get(column) for column in columns})

        # Build the frame once from all records: DataFrame.append was removed in
        # pandas 2.0 and appending row by row copies the frame on every iteration.
        df = pd.DataFrame(records, columns=columns)

        # Cast to float first so None becomes NaN, then fill NaN value to 0
        df = df.astype(float).fillna(0)

        # StandardScaler rejects input with zero rows, so there is nothing to scale.
        if df.empty:
            return df

        # Normalize the data. A fresh frame is built so the result is float64
        # rather than floats written into the original columns in place.
        scaled = StandardScaler().fit_transform(df)
        return pd.DataFrame(scaled, columns=df.columns, index=df.index)


In [40]:
# Build the helper for three example ticker symbols
clustering = Clustering(stock_list=['aapl', 'msft', 'tsla'])

In [41]:
# Run the data-gathering step for the ticker symbols configured above
clustering.gather_data()

   fullTimeEmployees  trailingAnnualDividendYield  payoutRatio  \
0           0.498955                     0.322815     0.316525   
1           0.896508                     1.031003     1.035412   
2          -1.395463                    -1.353818    -1.351937   

   averageDailyVolume10Day  trailingAnnualDividendRate  averageVolume10days  \
0                 1.411906                   -0.198700             1.411906   
1                -0.775896                    1.311946            -0.775896   
2                -0.636009                   -1.113246            -0.636009   

   dividendRate      beta  priceHint  trailingPE  ...  netIncomeToCommon  \
0     -0.216135 -0.251803        0.0   -0.708474  ...           0.925332   
1      1.318425 -1.079274        0.0   -0.705738  ...           0.463520   
2     -1.102289  1.331076        0.0    1.414213  ...          -1.388852   

   trailingEps  lastDividendValue  priceToBook  heldPercentInsiders  \
0     0.003634          -0.216135     0.93