# Clustering
This notebook is to demonstrate the function of gathering the data from the stock market and creating the clustering algorithm.

In [22]:
# Import libraries
import yfinance as yf
import pandas as pd
import numpy as np
import json

from sklearn.preprocessing import StandardScaler


In [33]:
class Clustering():
    '''

    Class to gather stock dataset and training the model.

    methods:
    - gather_data

    '''

    def __init__(self, stock_list):
        self.stock_list = stock_list

    def gather_data(self):
        '''

        Gathering and formatting stock data into a dataframe.

        return: Pandas DataFrame.

        '''
        list_columns = [
            'symbol'
            'fullTimeEmployees',
            'trailingAnnualDividendYield',
            'payoutRatio',
            'averageDailyVolume10Day',
            'trailingAnnualDividendRate',
            'averageVolume10days',
            'dividendRate',
            'beta',
            'priceHint',
            'trailingPE',
            'regularMarketVolume',
            'marketCap',
            'averageVolume',
            'priceToSalesTrailing12Months',
            'forwardPE',
            'fiveYearAvgDividendYield',
            'dividendYield',
            'enterpriseToRevenue',
            'profitMargins',
            'enterpriseToEbitda',
            'forwardEps',
            'bookValue',
            'sharesPercentSharesOut',
            'heldPercentInstitutions',
            'netIncomeToCommon',
            'trailingEps',
            'lastDividendValue',
            'priceToBook',
            'heldPercentInsiders',
            'shortRatio',
            'enterpriseValue',
            'earningsQuarterlyGrowth',
            'pegRatio',
            'shortPercentOfFloat',
        ]

        df = pd.DataFrame(columns=list_columns)
        for symbol in self.stock_list:
            try:
                print('[INFO] Gathering {} stock data...'.format(symbol))
                ticker = yf.Ticker(symbol)

                # Create DataFrame from Company Info
                company_info = ticker.info
                df_info = pd.DataFrame.from_dict(company_info, orient='index').T

                # Remove unnecessary column
                df_info.drop(df_info.columns.difference(list_columns), 1, inplace=True)

                df_info['Symbol'] = [symbol]

                # Add to the main dataframe
                df = df.append(df_info, ignore_index=True)
            except:
                pass

        # Fill NaN value to 0
        df = df.fillna(0)

        # Normalize the data
        df[list_columns] = StandardScaler().fit_transform(df[list_columns])

        print(df.head())

        export = df.to_csv('Stock Dataset.csv', index=False)
        print('[SUCCESS] Stock Dataset has been exported to the local directory.')


In [34]:
with open('tickers.json', 'r') as f:
    json_tickers = f.read()
    list_tickers = json.loads(json_tickers)

clustering = Clustering(list_tickers)

In [35]:
clustering.gather_data()

[INFO] Gathering AAPL stock data...
[INFO] Gathering MSFT stock data...
   symbolfullTimeEmployees  trailingAnnualDividendYield  payoutRatio  \
0                      0.0                         -1.0         -1.0   
1                      0.0                          1.0          1.0   

   averageDailyVolume10Day  trailingAnnualDividendRate  averageVolume10days  \
0                      1.0                        -1.0                  1.0   
1                     -1.0                         1.0                 -1.0   

   dividendRate  beta  priceHint  trailingPE  ...  trailingEps  \
0          -1.0   1.0        0.0        -1.0  ...         -1.0   
1           1.0  -1.0        0.0         1.0  ...          1.0   

   lastDividendValue  priceToBook  heldPercentInsiders  shortRatio  \
0               -1.0          1.0                  1.0        -1.0   
1                1.0         -1.0                 -1.0         1.0   

   enterpriseValue  earningsQuarterlyGrowth  pegRatio  shortPer