In [38]:
!pip install yfinance
!pip install pandas_datareader

Collecting pandas_datareader
  Downloading pandas_datareader-0.9.0-py3-none-any.whl (107 kB)
Installing collected packages: pandas-datareader
Successfully installed pandas-datareader-0.9.0


In [1]:
import warnings

import yfinance as yf
from pandas_datareader import data as pdr

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage, cut_tree
from sklearn.metrics import mean_squared_error,confusion_matrix, classification_report, roc_curve, auc
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.svm import SVC

In [2]:
# Read the transactions CSV into the DataFrame `data`.
# The path is relative, so the file must sit in the notebook's working directory.
data = pd.read_csv('all-trans-5-4-2021.csv')

In [3]:
# Columns we don't care about; they are removed from `data` in place.
drops = ['asset_description', 'comment', 'ptr_link']
columns_list = data.columns.values.tolist()  # every column name before removal
for unwanted in drops:
    if unwanted in columns_list:  # only pop names that are actually present
        data.pop(unwanted)

In [16]:
# Keep only stock purchases: remove, in place, every row whose asset type is
# not "Stock" or whose transaction type is not "Purchase".
data.drop(
    data[(data['asset_type'] != "Stock") | (data['type'] != "Purchase")].index,
    inplace=True,
)

In [5]:
def splice_date(date):
    """Return ``date`` with every '/' replaced by '-'.

    Parameters
    ----------
    date : str
        A date string such as ``"01/02/2021"``. Strings without a slash are
        returned unchanged.

    Returns
    -------
    str
        The same text with dashes in place of slashes, e.g. ``"01-02-2021"``.
    """
    # str.replace does in one call what the previous character-by-character
    # loop and repeated string concatenation did by hand.
    return date.replace('/', '-')

In [8]:
# Take the tail of `data` as the working subset. The explicit .copy() makes
# `test` an independent frame: it is later given new columns and has rows
# dropped in place, which should not be done on a slice of `data`
# (pandas reports that as SettingWithCopyWarning).
test = data[2400:].copy()

In [9]:
# Row positions in the downloaded price history used for each change column.
# NOTE(review): these are row offsets, not calendar days; the history
# presumably has one row per trading day, so 365 rows spans more than one
# calendar year. TODO confirm that this is the intended horizon.
_CHANGE_OFFSETS = {'_60_days': 60, '_120_days': 120, '_240_days': 240, '_365_days': 365}


def _price_changes(stock_data):
    """Return [transaction price, change at each offset] for one price history.

    The frame is read purely by position: column 2 of the first row is the
    transaction price, and column 5 of each offset row is the later price.
    All values are rounded to 3 decimals.

    NOTE(review): which price fields sit at positions 2 and 5 depends on the
    column order the data source returns. Confirm, and prefer column names.

    Raises
    ------
    IndexError
        If the history has fewer rows than the largest offset requires.
    """
    base_price = stock_data.iloc[0, 2]
    changes = [
        round(stock_data.iloc[offset, 5] - base_price, 3)
        for offset in _CHANGE_OFFSETS.values()
    ]
    return [round(base_price, 3)] + changes


def get_timelines():
    """Add price columns to the global ``test`` frame and drop failed lookups.

    For every row of ``test`` the price history of ``ticker`` is downloaded,
    starting at ``transaction_date``, and five values are derived from it
    (see ``_price_changes``). They are stored in the new columns
    ``_transaction_price``, ``_60_days``, ``_120_days``, ``_240_days`` and
    ``_365_days``. Rows whose lookup failed for any reason (bad date, unknown
    ticker, download error, history too short) are filled with ``'NA'`` and
    then removed from ``test`` in place.

    Progress is printed every 100 rows. Returns ``None``.
    """
    new_columns = ['_transaction_price'] + list(_CHANGE_OFFSETS)
    failed = ['NA'] * len(new_columns)  # shared marker row for failed lookups
    rows = []

    # Suppress warnings only while this function runs, instead of switching
    # them off for the rest of the kernel session.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        for i, (_, trade) in enumerate(test.iterrows()):
            if i % 100 == 0:
                print(str(i)+" entries have been analyzed.")
            try:
                # The date conversion is inside the try so that one malformed
                # date marks that row as failed instead of aborting the run.
                date = splice_date(trade['transaction_date'])
                stock_data = pdr.get_data_yahoo(trade['ticker'], start=date)
                rows.append(_price_changes(stock_data))
            except Exception:
                # Best effort by design: any lookup problem marks the row as
                # failed. `Exception` (not a bare except) keeps Ctrl-C /
                # kernel interrupt working during the long download loop.
                rows.append(failed)

        for position, name in enumerate(new_columns):
            test[name] = [row[position] for row in rows]

        # A failed lookup sets all five columns to 'NA' at once, so one drop
        # by row label removes exactly the rows the five drops used to remove.
        failed_labels = [
            label for label, row in zip(test.index, rows) if row is failed
        ]
        test.drop(index=failed_labels, inplace=True)

    print("Done with Data Gathering")

In [10]:
# Run the price lookup for every row of `test`. This mutates `test` in place:
# it adds `_transaction_price` and the four `_N_days` change columns, then
# drops the rows whose lookup failed. Progress is printed every 100 rows.
get_timelines()

0 entries have been analyzed.
100 entries have been analyzed.
200 entries have been analyzed.
300 entries have been analyzed.
400 entries have been analyzed.
500 entries have been analyzed.
600 entries have been analyzed.
700 entries have been analyzed.


In [12]:
# Only the last expression of a cell is rendered automatically, so the
# preview has to be displayed explicitly to appear alongside the shape.
display(test.head(20))
test.shape

(456, 12)

In [13]:
# Write the rows of `test` that survived get_timelines(), including the new
# price columns, to a CSV file in the working directory.
# NOTE(review): the file name says 500, but the number of rows written is
# whatever `test` holds at this point. Confirm the name is still accurate.
test.to_csv('500_trans_with_labels.csv')