# Clustering
This notebook is to demonstrate the function of gathering the data from the stock market and creating the clustering algorithm.

In [71]:
# Import libraries
import yfinance as yf
import pandas as pd
import numpy as np
import json

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans


In [188]:
class Clustering():
    '''

    Class to gather stock dataset and training the model.

    methods:
    - gather_data
    - cluster_export

    '''

    def __init__(self, stock_list):
        self.stock_list = stock_list
    
    def __get_increment(self, symbol):
        '''

        Get the increment of the stock in 1 month period.

        return: Float increment.

        '''

        stock_data = yf.download(
            tickers=symbol,
            period='1mo',
            interval='1wk'
        )

        stock_data = stock_data['Adj Close']

        latest_price = stock_data[-1]
        initial_price = stock_data[0]
        
        if initial_price is None:
            initial_price = stock_data[1]
        
        if latest_price is None:
            latest_price = stock_data[-2]

        increment = (latest_price - initial_price) / initial_price

        return increment

    def gather_data(self):
        '''

        Gathering and formatting stock data into a dataframe and export it to CSV file.

        return: None.

        '''
        list_columns = [
            'symbol',
            'sector',
            'fullTimeEmployees',
            'trailingAnnualDividendYield',
            'payoutRatio',
            'averageDailyVolume10Day',
            'trailingAnnualDividendRate',
            'averageVolume10days',
            'dividendRate',
            'beta',
            'priceHint',
            'trailingPE',
            'regularMarketVolume',
            'marketCap',
            'averageVolume',
            'priceToSalesTrailing12Months',
            'forwardPE',
            'fiveYearAvgDividendYield',
            'dividendYield',
            'enterpriseToRevenue',
            'profitMargins',
            'enterpriseToEbitda',
            'forwardEps',
            'bookValue',
            'sharesPercentSharesOut',
            'heldPercentInstitutions',
            'netIncomeToCommon',
            'trailingEps',
            'lastDividendValue',
            'priceToBook',
            'heldPercentInsiders',
            'shortRatio',
            'enterpriseValue',
            'earningsQuarterlyGrowth',
            'pegRatio',
            'shortPercentOfFloat',
            'increment'
        ]

        df = pd.DataFrame(columns=list_columns)
        for symbol in self.stock_list:
            try:
                print('[INFO] Gathering {} stock data...'.format(symbol))
                ticker = yf.Ticker(symbol)

                # Create DataFrame from Company Info
                company_info = ticker.info
                df_info = pd.DataFrame.from_dict(company_info, orient='index').T

                df_info['symbol'] = [symbol]

                # Get increment
                increment = self.__get_increment(symbol)
                df_info['increment'] = [increment]

                # Remove unnecessary column
                df_info.drop(df_info.columns.difference(list_columns), 1, inplace=True)

                # Add to the main dataframe
                df = df.append(df_info, ignore_index=True)
            except:
                pass

        # Fill NaN value to 0
        df = df.fillna(0)

        export = df.to_csv('Stock Dataset.csv', index=False)
        print('[SUCCESS] Stock Dataset has been exported to the local directory.')

    def normalize_dataset(self, path, index='symbol'):
        '''

        Cluster the dataset and export to a new csv dataset with clustered value.

        @param path: Folder directory of the dataset.
        @param index: Name of the column of index.

        return: None.

        '''

        df = pd.read_csv(path, index_col=index)

        # Remove inf and nan row
        df = df.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

        # Store columns name and index
        columns = df.columns
        index_column = df.index

        df_values = df.values

        # Define scaler
        scaler = StandardScaler()

        df_values_scaled = scaler.fit_transform(df_values)

        df = pd.DataFrame(df_values_scaled, columns=columns, index=index_column)

        export = df.to_csv('Normalized Stock Dataset.csv')
        print('[SUCCESS] Normalized Stock Dataset has been exported to the local directory.')

    def cluster_export(self, path, index='symbol', n_clusters=500):
        '''

        Cluster the dataset and export to a new csv dataset with clustered value.

        @param path: Folder directory of the normalized dataset.
        @param index: Name of the column of index.
        @param n_clusters: Number of cluster

        return: None.

        '''

        df = pd.read_csv(path, index_col=index)

        model = KMeans(n_clusters=n_clusters)

        model = model.fit(df.values)

        # Cluster labels
        labels = model.labels_

        df['cluster'] = labels

        df = df['cluster']

        export = df.to_csv('Clustered Stock Dataset.csv')
        print('[SUCCESS] Normalized Stock Dataset has been exported to the local directory.')


In [186]:
with open('tickers.json', 'r') as f:
    json_tickers = f.read()
    list_tickers = json.loads(json_tickers)

test_tickers = ['AAPL', 'MSFT']

clustering = Clustering(list_tickers)

# Run to gather the data
clustering.gather_data()

[INFO] Gathering AAPL stock data...
[*********************100%***********************]  1 of 1 completed
[INFO] Gathering MSFT stock data...
[*********************100%***********************]  1 of 1 completed
[SUCCESS] Stock Dataset has been exported to the local directory.


In [187]:
df = pd.read_csv('Stock Dataset.csv', index_col='symbol')
print(df.head())

            sector  fullTimeEmployees  trailingAnnualDividendYield  \
symbol                                                               
AAPL    Technology             147000                     0.006668   
MSFT    Technology             163000                     0.009077   

        payoutRatio  averageDailyVolume10Day  trailingAnnualDividendRate  \
symbol                                                                     
AAPL         0.2177                123191000                       0.807   
MSFT         0.3115                 32066483                       2.140   

        averageVolume10days  dividendRate      beta  priceHint  ...  \
symbol                                                          ...   
AAPL              123191000          0.82  1.251354          2  ...   
MSFT               32066483          2.24  0.812567          2  ...   

        trailingEps  lastDividendValue  priceToBook  heldPercentInsiders  \
symbol                                               

In [182]:
clustering.normalize_dataset('Stock Dataset.csv')

df_normalized = pd.read_csv('Normalized Stock Dataset.csv')

print(df_normalized.head())


ValueError: Found array with 0 sample(s) (shape=(0, 36)) while a minimum of 1 is required by StandardScaler.

In [135]:
clustering.cluster_export('Normalized Stock Dataset.csv', n_clusters=50)

[SUCCESS] Normalized Stock Dataset has been exported to the local directory.


In [136]:
df_clustered = pd.read_csv('Clustered Stock Dataset.csv')

print(df_clustered.head())


  symbol  cluster
0      A        0
1     AA        0
2    AAA        3
3   AAAU        3
4   AACG       11


In [151]:
symbol_cluster = df_clustered.loc[df_clustered['symbol'] == 'JPM']['cluster'].values[0]
print(symbol_cluster)


7


In [152]:
same_cluster = df_clustered.loc[df_clustered['cluster'] == symbol_cluster]
print(same_cluster)


     symbol  cluster
692     BAC        7
1181      C        7
1538  CMCSA        7
1723   CSCO        7
3574     HD        7
4094   INTC        7
4374    JNJ        7
4394    JPM        7
4534     KO        7
4863     MA        7
5171    MRK        7
5185     MS        7
5602    NVS        7
5774   ORCL        7
5966    PEP        7
5981    PFE        7
6011     PG        7
7932    UNH        7
8028      V        7
8311     VZ        7
