# Clustering
This notebook demonstrates how to gather company data from the stock market, normalize it, and group the stocks with a clustering algorithm.

In [4]:
# Import libraries
import yfinance as yf
import pandas as pd
import numpy as np
import json

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans


In [106]:
class Clustering():
    '''

    Class to gather a stock dataset and train a clustering model on it.

    Every method writes its result to a CSV file in the current working
    directory rather than returning it.

    methods:
    - gather_data
    - normalize_data
    - cluster_export

    '''

    def __init__(self, stock_list):
        '''

        @param stock_list: Iterable of ticker symbols to gather.

        '''

        self.stock_list = stock_list

        # Columns kept in the exported dataset. 'symbol' and 'increment' are
        # filled in by gather_data; the rest are read from the ticker info.
        self.list_columns = [
            'symbol',
            'sector',
            'fullTimeEmployees',
            'trailingAnnualDividendYield',
            'payoutRatio',
            'averageDailyVolume10Day',
            'trailingAnnualDividendRate',
            'averageVolume10days',
            'dividendRate',
            'beta',
            'priceHint',
            'trailingPE',
            'regularMarketVolume',
            'marketCap',
            'averageVolume',
            'priceToSalesTrailing12Months',
            'forwardPE',
            'fiveYearAvgDividendYield',
            'dividendYield',
            'enterpriseToRevenue',
            'profitMargins',
            'enterpriseToEbitda',
            'forwardEps',
            'bookValue',
            'sharesPercentSharesOut',
            'heldPercentInstitutions',
            'netIncomeToCommon',
            'trailingEps',
            'lastDividendValue',
            'priceToBook',
            'heldPercentInsiders',
            'shortRatio',
            'enterpriseValue',
            'earningsQuarterlyGrowth',
            'pegRatio',
            'shortPercentOfFloat',
            'increment'
        ]

    def __get_increment(self, symbol):
        '''

        Get the increment of the stock in 1 month period.

        @param symbol: Ticker symbol to download prices for.

        return: Float increment, (latest - initial) / initial.

        raises: ValueError when no usable price is available.

        '''

        stock_data = yf.download(
            tickers=symbol,
            period='1mo',
            interval='1wk'
        )

        if stock_data.empty:
            raise ValueError('No price data available for {}'.format(symbol))

        # NOTE(review): some yfinance releases auto-adjust prices and omit the
        # 'Adj Close' column; fall back to 'Close' then - confirm against the
        # installed version.
        price_column = 'Adj Close' if 'Adj Close' in stock_data.columns else 'Close'
        prices = stock_data[price_column]

        # With multi-level columns the selection is a one-column frame.
        if isinstance(prices, pd.DataFrame):
            prices = prices.iloc[:, 0]

        # Missing prices are NaN (never None), so drop them and take the first
        # and last valid values by position.
        prices = prices.dropna()
        if prices.empty:
            raise ValueError('No valid prices available for {}'.format(symbol))

        initial_price = prices.iloc[0]
        latest_price = prices.iloc[-1]

        return (latest_price - initial_price) / initial_price

    def gather_data(self):
        '''

        Gathering and formatting stock data into a dataframe and export it to CSV file.

        Symbols without a 'fullTimeEmployees' value, and symbols whose download
        fails, are skipped so that one bad ticker does not abort the whole run.

        return: None.

        '''

        rows = []
        for symbol in self.stock_list:
            print('[INFO] Gathering {} stock data...'.format(symbol))
            try:
                company_info = yf.Ticker(symbol).info

                # Skip tickers that do not report an employee count.
                if company_info.get('fullTimeEmployees') is None:
                    continue

                # Keep only the wanted fields; absent ones become empty cells.
                row = {
                    column: company_info.get(column)
                    for column in self.list_columns
                }
                row['symbol'] = symbol
                row['increment'] = self.__get_increment(symbol)
            except Exception as error:
                # Best effort: report the failure instead of hiding it.
                print('[WARNING] Skipping {}: {}'.format(symbol, error))
                continue

            rows.append(row)

        # Build the frame once; appending row by row is quadratic and
        # DataFrame.append no longer exists in pandas 2.
        df = pd.DataFrame(rows, columns=self.list_columns)

        # Export dataset
        df.to_csv('Stock Dataset.csv', index=False)
        print('[SUCCESS] Stock Dataset has been exported to the local directory.')

    def normalize_data(self, path):
        '''

        Normalize the data and export it to 'Normalized Stock Dataset.csv'.

        @param path: Path of the CSV dataset produced by gather_data.

        return: None.

        '''

        dict_sector = {
            'Healthcare': 1,
            'Basic Materials': 2,
            'Consumer Defensive': 3,
            'Real Estate': 4,
            'Industrials': 5,
            'Financial Services': 6,
            'Consumer Cyclical': 7,
            'Technology': 8,
            'Communication Services': 9,
            'Utilities': 10,
            'Energy': 11
        }

        df = pd.read_csv(path)

        # Missing values become 0; rows holding +/-inf are removed entirely.
        df = df.fillna(0)
        df = df.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

        # Keep the symbols aside (taken after the row filtering so they stay
        # aligned with the scaled rows).
        symbol_list = df['symbol'].values

        # Encode the sector; missing or unknown sectors are encoded as 0 so
        # the scaler only ever sees numbers.
        df['sector'] = df['sector'].map(dict_sector).fillna(0)

        # Drop non digit columns
        df = df.drop(columns=['symbol'])

        # Normalize. Column names are taken from the frame itself so they
        # stay correct whatever the column order of the CSV is.
        scaler = StandardScaler()
        df_values_scaled = scaler.fit_transform(df.values)
        df = pd.DataFrame(df_values_scaled, columns=df.columns)

        # Scale increment
        df['increment'] = np.multiply(df['increment'], 100)

        # Scale sector
        df['sector'] = np.multiply(df['sector'], 100)

        # Re-add the symbol column
        df['symbol'] = symbol_list

        df.to_csv('Normalized Stock Dataset.csv', index=False)
        print('[SUCCESS] Normalized Stock Dataset has been exported to the local directory.')

    def cluster_export(self, path, index='symbol', n_clusters=500, random_state=None):
        '''

        Cluster the dataset and export to a new csv dataset with clustered value.

        @param path: Path of the normalized CSV dataset.
        @param index: Name of the column of index.
        @param n_clusters: Number of cluster (capped at the number of rows).
        @param random_state: Optional KMeans seed for reproducible labels.

        return: None.

        raises: ValueError when the dataset has no rows.

        '''

        df = pd.read_csv(path, index_col=index)

        if df.empty:
            raise ValueError('Cannot cluster an empty dataset: {}'.format(path))

        # KMeans cannot fit more clusters than there are samples.
        n_clusters = min(n_clusters, len(df))

        model = KMeans(n_clusters=n_clusters, random_state=random_state)
        model = model.fit(df.values)

        # Cluster labels
        df['cluster'] = model.labels_

        # Only the index and the cluster label are exported.
        df = df['cluster']

        df.to_csv('Clustered Stock Dataset.csv')
        print('[SUCCESS] Clustered Stock Dataset has been exported to the local directory.')


In [107]:
# Load the full ticker universe from disk.
with open('tickers.json', 'r') as f:
    json_tickers = f.read()
list_tickers = json.loads(json_tickers)

# Small sample kept at hand for quick experiments.
test_tickers = ['AAPL', 'MSFT', 'VOO', 'AAAU', 'WNC']

clustering = Clustering(list_tickers)

# Run to gather the data
# clustering.gather_data()

In [108]:
# Load the gathered dataset, then show its row count and first rows.
df = pd.read_csv('Stock Dataset.csv')
row_count = len(df)
print(row_count)
print(df.head())

4421
  symbol              sector  fullTimeEmployees  trailingAnnualDividendYield  \
0      A          Healthcare              16400                     0.005987   
1     AA     Basic Materials              12900                     0.011542   
2   AACG  Consumer Defensive                768                          NaN   
3   AAIC         Real Estate                 12                          NaN   
4    AAL         Industrials             102700                     0.004049   

   payoutRatio  averageDailyVolume10Day  trailingAnnualDividendRate  \
0       0.2823                1711725.0                       0.734   
1       0.0000                6112825.0                       0.360   
2       0.0000                 614150.0                         NaN   
3       0.0000                 187787.0                         NaN   
4          NaN               52531950.0                       0.100   

   averageVolume10days  dividendRate      beta  ...  trailingEps  \
0            171172

In [109]:
# Show the 'symbol' column of the dataset loaded above.
print(df['symbol'])

0          A
1         AA
2       AACG
3       AAIC
4        AAL
        ... 
4416    ZUMZ
4417     ZVO
4418    ZYME
4419    ZYNE
4420    ZYXI
Name: symbol, Length: 4421, dtype: object


In [110]:
# List the distinct values found in the 'sector' column.
print(df['sector'].unique().tolist())

['Healthcare', 'Basic Materials', 'Consumer Defensive', 'Real Estate', 'Industrials', 'Financial Services', 'Consumer Cyclical', 'Technology', 'Communication Services', 'Utilities', 'Energy', nan]


In [111]:
# Normalize the gathered dataset; the result is written to 'Normalized Stock Dataset.csv'.
clustering.normalize_data('Stock Dataset.csv')

[SUCCESS] Normalized Stock Dataset has been exported to the local directory.


In [124]:
# Cluster the normalized dataset into 200 clusters; the result is written to 'Clustered Stock Dataset.csv'.
clustering.cluster_export('Normalized Stock Dataset.csv', n_clusters=200)

[SUCCESS] Clustered Stock Dataset has been exported to the local directory.


In [125]:
# Read back the exported cluster assignments and preview the first rows.
df_clustered = pd.read_csv('Clustered Stock Dataset.csv')
clustered_preview = df_clustered.head()
print(clustered_preview)


  symbol  cluster
0      A      146
1     AA      144
2   AACG      117
3   AAIC       37
4    AAL      110


In [132]:
# Look up the cluster label assigned to the 'CRM' row.
is_crm = df_clustered['symbol'] == 'CRM'
symbol_cluster = df_clustered.loc[is_crm, 'cluster'].values[0]
print(symbol_cluster)


165


In [133]:
# Select every row that shares the cluster label found above.
in_same_cluster = df_clustered['cluster'] == symbol_cluster
same_cluster = df_clustered.loc[in_same_cluster]
print(same_cluster)


     symbol  cluster
194    ALRM      165
202    ALTR      165
274    AOSL      165
365    ASUR      165
422    AVNW      165
...     ...      ...
4028   UEIC      165
4169    VPG      165
4183   VRSN      165
4275   WORK      165
4311    WTT      165

[62 rows x 2 columns]
