In [2]:
# Imported here because this configuration cell sits above the notebook's import cell.
import os

# Alpha Vantage API key used by get_industry. It is read from the environment so
# the secret never has to be saved inside the notebook; when the variable is not
# set the value falls back to the empty placeholder.
API_KEY = os.environ.get('ALPHAVANTAGE_API_KEY', '')
# Directory whose entries are listed by apply_ticker_get_industry and read as
# CSV files by join_data.
filings = '/content/drive/MyDrive/BERT-SEC'

In [50]:
import pandas as pd
import numpy as np
import requests
import time
import os

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import plotly.express as px



In [5]:
# Headers sent with the sec.gov request.
# NOTE(review): this is a browser-style User-Agent; SEC's fair-access guidance
# reportedly asks automated clients to identify themselves — confirm and adjust.
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' }

# Download the ticker listing. The timeout keeps the cell from hanging forever,
# and raise_for_status() turns an HTTP error into a clear HTTPError instead of a
# confusing JSON decoding failure on the error page.
_tickers_response = requests.get('https://www.sec.gov/files/company_tickers.json', headers=headers, timeout=30)
_tickers_response.raise_for_status()
cik_codes_json = _tickers_response.json()

# One row per value of the downloaded JSON object. get_ticker relies on the
# resulting 'cik_str' and 'ticker' columns.
companies = pd.DataFrame(list(cik_codes_json.values()))

def get_industry(ticker, timeout=30):
  """Fetch the Alpha Vantage OVERVIEW record for a ticker symbol.

  Despite the name, the whole parsed JSON body is returned; extracting a
  specific field is left to the caller.

  Args:
    ticker: Ticker symbol to look up.
    timeout: Seconds to wait for the HTTP request before requests raises a
      timeout error (previously the call could block indefinitely).

  Returns:
    The decoded JSON body of the response.

  Raises:
    requests.exceptions.RequestException: on connection problems or timeout.
    ValueError: if the response body is not valid JSON.
  """
  # Passing the query through `params` lets requests URL-encode the values, so
  # symbols containing characters such as '&' or spaces cannot break the URL.
  r = requests.get(
      'https://www.alphavantage.co/query',
      params={'function': 'OVERVIEW', 'symbol': ticker, 'apikey': API_KEY},
      timeout=timeout)
  return r.json()

def get_ticker(cik):
  """Look up the ticker symbol(s) registered for a CIK.

  Reads the module-level `companies` table.

  Args:
    cik: CIK value compared for equality against the 'cik_str' column.

  Returns:
    A pandas Series holding the 'ticker' values of every matching row. The
    Series is empty when no row matches, so callers must pick an element
    themselves (e.g. with `.iloc[0]`).
  """
  matches_cik = companies['cik_str'] == cik
  return companies.loc[matches_cik, 'ticker']

def apply_ticker_get_industry():
  """Fetch the company overview for every entry in the `filings` directory.

  The CIK of each entry is the integer before the first '_' in its name. Each
  CIK is resolved to a ticker with get_ticker and looked up with get_industry.

  Returns:
    A list with one overview per directory entry, in os.listdir order. Entries
    that share a CIK share the same response object, because each CIK is
    requested only once.

  Raises:
    ValueError: if an entry name does not start with an integer.
    IndexError: if a CIK has no row in the companies table.
  """
  ciks = [int(name.split('_')[0]) for name in os.listdir(filings)]
  # Several files can belong to the same company; caching per CIK avoids
  # repeating an identical HTTP request for each of them.
  overview_by_cik = {}
  mapping = []
  for c in ciks:
    if c not in overview_by_cik:
      t = str(get_ticker(c).iloc[0])
      overview_by_cik[c] = get_industry(t)
    mapping.append(overview_by_cik[c])
  return mapping

In [21]:
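#
# Joins all the datasets: every file in the `filings` directory is read as a CSV
# and the frames are stacked into a single DataFrame over the union of their
# columns (a column that a file lacks is filled with 0 for that file's rows).
#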
def join_data():
  """Read every file in `filings` as CSV and stack them into one DataFrame.

  The result has the union of all columns. A column missing from a given file
  is added to that file's rows with the value 0; NaNs already present in a file
  are left untouched.

  Returns:
    A DataFrame with a fresh 0..n-1 index. Columns appear in first-seen order
    (file by file, in os.listdir order).

  Raises:
    ValueError: from pd.concat when the directory contains no files.
  """
  df_array = [pd.read_csv(os.path.join(filings, f)) for f in os.listdir(filings)]
  # dict.fromkeys keeps first-seen order. The previous set-based union produced
  # a column order that could change from one kernel start to the next.
  all_columns = list(dict.fromkeys(col for df in df_array for col in df.columns))
  filled_dataframes = [df.reindex(columns=all_columns, fill_value=0) for df in df_array]
  stacked_df = pd.concat(filled_dataframes, ignore_index=True)

  return stacked_df

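#
# Select features: keep only the columns that do not contain too many zeros
# params:
#   - threshold: The maximum fraction (0 to 1, not a percentage) of zero values
#                a feature may contain and still be kept
#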
def get_features(df, threshold):
  """Keep only the columns whose share of zero values is at most `threshold`.

  Args:
    df: DataFrame to filter.
    threshold: Fraction of the row count; a column is kept when its number of
      zeros is less than or equal to `len(df) * threshold`.

  Returns:
    `df` restricted to the kept columns, in their original order.
  """
  max_zeros = len(df) * threshold
  zeros_per_column = df.eq(0).sum()
  kept = [name for name, zeros in zeros_per_column.items() if zeros <= max_zeros]
  return df[kept]

In [60]:
def perform_pca_select_features_df(df, n_components=None, variance_threshold=0.95):
  """Project `df` with PCA and keep the leading components that explain enough variance.

  Args:
    df: DataFrame whose `.values` are fed to PCA.
    n_components: Passed straight to `PCA(n_components=...)`.
    variance_threshold: Cumulative explained-variance ratio to reach; defaults
      to the previously hard-coded 0.95.

  Returns:
    A 2-D array with the smallest number of leading components whose cumulative
    explained-variance ratio is >= `variance_threshold`. If the fitted
    components never reach the threshold (for example when `n_components` is
    small), all fitted components are returned instead of an empty array.
  """
  X = df.values

  pca = PCA(n_components=n_components)
  X_pca = pca.fit_transform(X)

  cumulative_var = np.cumsum(pca.explained_variance_ratio_)
  reached = np.flatnonzero(cumulative_var >= variance_threshold)
  if reached.size:
    selected_components = int(reached[0]) + 1
  else:
    # Threshold never reached: keep everything that was computed rather than
    # slicing down to zero columns.
    selected_components = X_pca.shape[1]

  return X_pca[:, :selected_components]

In [64]:
def build_lof_model(data, n_neighbors=20, contamination=0.1):
  """Standardize `data` and fit a LocalOutlierFactor model on the result.

  Args:
    data: Samples to scale and fit on.
    n_neighbors: Forwarded to LocalOutlierFactor.
    contamination: Forwarded to LocalOutlierFactor.

  Returns:
    A `(lof_model, scaler)` tuple: the fitted LocalOutlierFactor and the
    StandardScaler that was fitted on `data`.
  """
  feature_scaler = StandardScaler()
  detector = LocalOutlierFactor(contamination=contamination, n_neighbors=n_neighbors)
  # The detector is fitted on the standardized values, not on the raw input.
  detector.fit(feature_scaler.fit_transform(data))
  return detector, feature_scaler

In [72]:
def evaluate_lof_silhouette(lof_model, dataframe):
  """Silhouette score of the inlier/outlier split produced by a fitted LOF model.

  `silhouette_score` expects one discrete label per sample. The continuous LOF
  scores used before are not labels, so the scores are first turned into binary
  labels with the model's own `offset_` cut-off: samples whose
  `negative_outlier_factor_` is below `offset_` are labelled -1 (outlier), all
  others 1 (inlier).

  Args:
    lof_model: Fitted LocalOutlierFactor exposing `negative_outlier_factor_`
      and `offset_`.
    dataframe: The samples the model was fitted on, in the same row order.
      NOTE(review): build_lof_model fits on standardized data — confirm whether
      the scaled values should be passed here.

  Returns:
    The value returned by `silhouette_score(dataframe, labels)`.

  Raises:
    ValueError: raised by silhouette_score when only one label is present
      (e.g. no sample falls below the cut-off).
  """
  labels = np.where(lof_model.negative_outlier_factor_ < lof_model.offset_, -1, 1)

  silhouette = silhouette_score(dataframe, labels)

  return silhouette

In [65]:
def visualize_lof_scores(lof_model, dataframe):
  """Scatter-plot the LOF score of every sample against its row position.

  Args:
    lof_model: Fitted model exposing `negative_outlier_factor_`.
    dataframe: Only its length is used, to build the x positions.
  """
  # The model stores the factor negated; flip the sign back before plotting.
  scores = -lof_model.negative_outlier_factor_
  positions = range(len(dataframe))

  plt.scatter(positions, scores, c='blue', s=20, label='LOF scores')
  plt.title('Local Outlier Factor (LOF) Scores')
  plt.xlabel('Data Points')
  plt.ylabel('LOF Scores')
  plt.legend()
  plt.show()

In [1]:
def cluster_build_plot(df_p, n_clusters, features, random_state=None):
    """Cluster `df_p[features]` with k-means and plot the result when it is 2-D.

    The cluster label of every row is written to `df_p['Cluster']`, so the
    passed frame is modified in place. Previously the function wrote to and
    plotted from a global `df` instead of its own argument.

    Args:
        df_p: DataFrame containing the feature columns.
        n_clusters: Number of clusters for KMeans.
        features: List of column names to cluster on. A scatter plot is drawn
            only when exactly two are given.
        random_state: Forwarded to KMeans; pass an int for reproducible
            clusters. The default (None) keeps the previous unseeded behaviour.
    """
    X = df_p[features]
    kmeans = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        n_init=35,
        max_iter=300,
        tol=1e-4,
        verbose=0,
        random_state=random_state,
        copy_x=True,
        algorithm='elkan')
    kmeans.fit(X)

    labels = kmeans.predict(X)

    # Store the labels on the frame that was passed in, not on a global.
    df_p['Cluster'] = labels

    if len(features) == 2:
        plt.figure(figsize=(8, 6))
        plt.scatter(df_p[features[0]], df_p[features[1]], c=labels, cmap='viridis', marker='o', edgecolors='black')
        plt.title('Cluster Plot')
        plt.xlabel(features[0])
        plt.ylabel(features[1])
        plt.colorbar(label='Cluster')
        plt.show()
    else:
        print("Plotting is available only for 2D data.")


In [2]:
def cluster_build_plot_3d(df_p, n_clusters, features, random_state=None):
    """Cluster `df_p[features]` with k-means and show an interactive 3-D scatter.

    The cluster label of every row is written to `df_p['Cluster']`, so the
    passed frame is modified in place. Previously the function wrote to and
    plotted from a global `df` instead of its own argument.

    Args:
        df_p: DataFrame containing the feature columns.
        n_clusters: Number of clusters for KMeans.
        features: List of column names. All are used for clustering; the first
            three are used as the x, y and z axes.
        random_state: Forwarded to KMeans; pass an int for reproducible
            clusters. The default (None) keeps the previous unseeded behaviour.

    Raises:
        IndexError: if fewer than three feature names are given.
    """
    X = df_p[features]

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(X)

    labels = kmeans.predict(X)

    # Store the labels on the frame that was passed in, not on a global.
    df_p['Cluster'] = labels

    fig = px.scatter_3d(df_p, x=features[0], y=features[1], z=features[2], color='Cluster', symbol='Cluster')
    fig.update_layout(title='Cluster Plot in 3D', scene=dict(
        xaxis_title=features[0],
        yaxis_title=features[1],
        zaxis_title=features[2]
    ))
    fig.show()


In [3]:
def top_correlated_pairs(df_p, n):
    """Return the `n` column pairs with the largest absolute correlation.

    Args:
        df_p: DataFrame whose columns are correlated with `DataFrame.corr()`.
        n: Maximum number of pairs to return.

    Returns:
        A Series of absolute correlations sorted in descending order and
        indexed by `(column_a, column_b)` tuples. Each unordered pair appears
        once, written with `column_a < column_b`. Pairs whose correlation is
        NaN are omitted.
    """
    corr_matrix = df_p.corr().abs()
    # `pd.np` no longer exists in current pandas; use numpy directly.
    # The mask hides only the diagonal (each column's correlation with itself).
    off_diagonal = ~np.eye(corr_matrix.shape[0], dtype=bool)

    stacked_corr = (
        corr_matrix.where(off_diagonal)
        .stack()
        .dropna()  # explicit, so the result does not depend on stack()'s NaN handling
        .sort_values(ascending=False)
    )
    # The matrix is symmetric, so keep one orientation of every pair.
    keep = np.array([a < b for a, b in stacked_corr.index], dtype=bool)
    top_pairs = stacked_corr[keep].head(n)

    return top_pairs