# Clustering
This notebook is to demonstrate the function of gathering the data from the stock market and creating the clustering algorithm.

In [16]:
# Import libraries
import yfinance as yf
import pandas as pd


In [85]:
class Clustering():
    '''

    Class to gather stock dataset and training the model.

    methods:
    - gather_data

    '''

    def __init__(self, stock_list):
        self.stock_list = stock_list

    def gather_data(self):
        '''

        Gathering and formatting stock data into a dataframe.

        return: Pandas DataFrame.

        '''
        list_columns = [
            'fullTimeEmployees',
            'trailingAnnualDividendYield',
            'payoutRatio',
            'averageDailyVolume10Day',
            'trailingAnnualDividendRate',
            'averageVolume10days',
            'dividendRate',
            'beta',
            'priceHint',
            'trailingPE',
            'regularMarketVolume',
            'marketCap',
            'averageVolume',
            'priceToSalesTrailing12Months',
            'forwardPE',
            'fiveYearAvgDividendYield',
            'dividendYield',
            'enterpriseToRevenue',
            'profitMargins',
            'enterpriseToEbitda',
            'forwardEps',
            'bookValue',
            'sharesPercentSharesOut',
            'heldPercentInstitutions',
            'netIncomeToCommon',
            'trailingEps',
            'lastDividendValue',
            'priceToBook',
            'heldPercentInsiders',
            'shortRatio',
            'enterpriseValue',
            'earningsQuarterlyGrowth',
            'pegRatio',
            'shortPercentOfFloat',
        ]

        df = pd.DataFrame(columns=list_columns)
        for symbol in self.stock_list:
            ticker = yf.Ticker(symbol)

            # Create DataFrame from Company Info
            company_info = ticker.info
            df_info = pd.DataFrame.from_dict(company_info, orient='index').T

            # Remove unnecessary column
            df_info = df_info.loc[:, df.columns.intersection(list_columns)]

            # Add to the main dataframe
            df = df.append(df_info, ignore_index=True)

        # Fill NaN value to 0
        df = df.fillna(0)

        # export = df.to_csv('Stock Dataset.csv', index=False)
        # print('[SUCCESS] Stock Dataset has been exported to the local directory.')


In [86]:
clustering = Clustering(['aapl', 'msft', 'tsla'])

In [87]:
clustering.gather_data()

fullTimeEmployees               False
trailingAnnualDividendYield      True
payoutRatio                     False
averageDailyVolume10Day         False
trailingAnnualDividendRate       True
averageVolume10days             False
dividendRate                     True
beta                            False
priceHint                       False
trailingPE                      False
regularMarketVolume             False
marketCap                       False
averageVolume                   False
priceToSalesTrailing12Months    False
forwardPE                       False
fiveYearAvgDividendYield         True
dividendYield                    True
enterpriseToRevenue             False
profitMargins                   False
enterpriseToEbitda              False
forwardEps                      False
bookValue                       False
sharesPercentSharesOut          False
heldPercentInstitutions         False
netIncomeToCommon               False
trailingEps                     False
lastDividend