# Clustering
This notebook demonstrates how to gather data from the stock market and build a clustering algorithm on it.

In [3]:
# Import libraries
import yfinance as yf
import pandas as pd
import numpy as np
import json

from sklearn.preprocessing import StandardScaler


In [4]:
class Clustering():
    '''

    Gather fundamental stock data for a list of ticker symbols.

    attributes:
    - stock_list: iterable of ticker symbols given to the constructor.
    - FEATURE_COLUMNS: names of the fields kept for every ticker.

    methods:
    - gather_data

    '''

    # Fields copied from each ticker's info mapping; 'symbol' is always
    # filled in from the requested ticker symbol.
    FEATURE_COLUMNS = [
        'symbol',
        'fullTimeEmployees',
        'trailingAnnualDividendYield',
        'payoutRatio',
        'averageDailyVolume10Day',
        'trailingAnnualDividendRate',
        'averageVolume10days',
        'dividendRate',
        'beta',
        'priceHint',
        'trailingPE',
        'regularMarketVolume',
        'marketCap',
        'averageVolume',
        'priceToSalesTrailing12Months',
        'forwardPE',
        'fiveYearAvgDividendYield',
        'dividendYield',
        'enterpriseToRevenue',
        'profitMargins',
        'enterpriseToEbitda',
        'forwardEps',
        'bookValue',
        'sharesPercentSharesOut',
        'heldPercentInstitutions',
        'netIncomeToCommon',
        'trailingEps',
        'lastDividendValue',
        'priceToBook',
        'heldPercentInsiders',
        'shortRatio',
        'enterpriseValue',
        'earningsQuarterlyGrowth',
        'pegRatio',
        'shortPercentOfFloat',
    ]

    def __init__(self, stock_list):
        '''

        stock_list: iterable of ticker symbols to download.

        '''
        self.stock_list = stock_list

    @staticmethod
    def _build_record(symbol, company_info, columns):
        '''

        Build one dataset row from a ticker's info mapping.

        Only the requested columns are kept. A column that is absent from
        company_info, or whose value is None/NaN, is stored as 0.

        return: dict mapping column name to value.

        '''
        record = {}
        for column in columns:
            value = company_info.get(column)
            # NaN is the only float that is not equal to itself.
            is_nan = isinstance(value, float) and value != value
            record[column] = 0 if value is None or is_nan else value

        # The requested symbol wins over whatever the info mapping reports.
        record['symbol'] = symbol
        return record

    def gather_data(self, output_path='Stock Dataset.csv'):
        '''

        Gathering and formatting stock data into a dataframe.

        Every symbol in stock_list is looked up through yfinance. A symbol
        whose lookup raises is reported and skipped, so one bad ticker does
        not abort the whole run. The resulting table is also written as CSV.

        output_path: destination of the CSV export.

        return: Pandas DataFrame with one row per successfully gathered
        symbol and the columns listed in FEATURE_COLUMNS.

        '''
        records = []
        for symbol in self.stock_list:
            print('[INFO] Gathering {} stock data...'.format(symbol))
            try:
                company_info = yf.Ticker(symbol).info
                record = self._build_record(
                    symbol, company_info, self.FEATURE_COLUMNS)
            except Exception as error:
                # Catch Exception (not a bare except) so the loop can still
                # be interrupted from the keyboard, and say what was skipped.
                print('[WARNING] Skipping {}: {}'.format(symbol, error))
                continue
            records.append(record)

        # Build the frame once: DataFrame.append was removed in pandas 2.0,
        # and growing a frame row by row is quadratic anyway.
        df = pd.DataFrame(records, columns=self.FEATURE_COLUMNS)

        df.to_csv(output_path, index=False)
        print('[SUCCESS] Stock Dataset has been exported to the local directory.')
        return df


In [5]:
# Read the raw JSON text first, then parse it into the collection of ticker
# symbols once the file handle has been released.
with open('tickers.json', 'r') as ticker_file:
    json_tickers = ticker_file.read()
list_tickers = json.loads(json_tickers)

# Download the data for every ticker and export it to CSV.
# The trailing semicolon keeps the cell from echoing a return value.
clustering = Clustering(list_tickers)
clustering.gather_data();

stock data...
[INFO] Gathering VINCW stock data...
[INFO] Gathering VINO stock data...
[INFO] Gathering VINP stock data...
[INFO] Gathering VIOG stock data...
[INFO] Gathering VIOO stock data...
[INFO] Gathering VIOT stock data...
[INFO] Gathering VIOV stock data...
[INFO] Gathering VIPS stock data...
[INFO] Gathering VIR stock data...
[INFO] Gathering VIRC stock data...
[INFO] Gathering VIRI stock data...
[INFO] Gathering VIRS stock data...
[INFO] Gathering VIRT stock data...
[INFO] Gathering VIRX stock data...
[INFO] Gathering VIS stock data...
[INFO] Gathering VISL stock data...
[INFO] Gathering VIST stock data...
[INFO] Gathering VITL stock data...
[INFO] Gathering VIV stock data...
[INFO] Gathering VIVE stock data...
[INFO] Gathering VIVO stock data...
[INFO] Gathering VIXM stock data...
[INFO] Gathering VIXY stock data...
[INFO] Gathering VJET stock data...
[INFO] Gathering VKI stock data...
[INFO] Gathering VKQ stock data...
[INFO] Gathering VKTX stock data...
[INFO] Gathering V

In [7]:
# Reload the exported dataset from disk, using the ticker symbol as the row
# index, and print the first rows as a quick sanity check.
df = pd.read_csv('Stock Dataset.csv', index_col='symbol')
first_rows = df.head()
print(first_rows)

        fullTimeEmployees  trailingAnnualDividendYield  payoutRatio  \
symbol                                                                
A                   16400                     0.006125       0.2823   
AA                  12900                     0.012024       0.0000   
AAA                     0                     0.000000       0.0000   
AAAU                    0                     0.000000       0.0000   
AACG                  768                     0.000000       0.0000   

        averageDailyVolume10Day  trailingAnnualDividendRate  \
symbol                                                        
A                       2072542                       0.734   
AA                      8897628                       0.360   
AAA                        4442                       0.000   
AAAU                     633314                       0.000   
AACG                     248600                       0.000   

        averageVolume10days  dividendRate      beta  priceHi