In [24]:
import numpy as np
import glob
import pandas as pd

In [15]:
#step 1 - combining all daily files into one file
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

#combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
#export to csv
combined_csv.to_csv( "combined_csv.csv", index=False, encoding='utf-8-sig')

In [29]:
# setting df as a DataFrame as combined_csv
#all data from the .csv is in strings it appears
df = pd.read_csv("combined_csv.csv")

#drop blanks from the ticker
df['ticker'] = df['ticker'].str.strip()
df['ticker'].replace('', np.nan, inplace=True)
df['ticker'].replace('TRUE', np.nan, inplace=True)
#drop ticker.str.len() > 6 from ticker (no ticker is longer than 6)
df['ticker'].values[df['ticker'].str.len() > 6,] = np.nan
df.dropna(subset=['ticker'], inplace=True)

#drop blanks and 0s from price
df['price'] = df['price'].str.strip() #strip blank spaces
df['price'].replace('', np.nan, inplace=True) #replace blanks with NaN
df['price'] = df['price'].replace(',','', regex=True) #strip commas
df['price'] = pd.to_numeric(df['price']) #convert price column from string to numbers
df['price'].replace(0,np.nan,inplace=True) #remove 0s
df.dropna(subset=['price'], inplace=True) #remove all NaNs

#converting num_mentions, pos_count, neg_count, price, price_change_net into int/floats
df['num_mentions'] = pd.to_numeric(df['num_mentions'])
df['pos_count'] = pd.to_numeric(df['pos_count'])
df['neg_count'] = pd.to_numeric(df['neg_count'])
df['price'] = pd.to_numeric(df['price'])
df['price_change_net'] = df['price_change_net'].replace(',','', regex=True) #strip commas
df['price_change_net'] = pd.to_numeric(df['price_change_net'])

#converting date column into date_time format
df['date'] = pd.to_datetime(df['date'])

#cleaning pct_mentions and converting to number
df['pct_mentions'] = df['pct_mentions'].str.strip() #strip blank spaces
df['pct_mentions'] = df['pct_mentions'].replace('<1','0.5', regex=True) #change '<1' to '0.5'
df['pct_mentions'] = pd.to_numeric(df['pct_mentions'])

# cleaning the price_change_pct column
df.price_change_net = df.price_change_net.astype(float)
df['price_change_pct'] = df['price_change_pct'].str.strip() #strip blank spaces
df['price_change_pct'] = df['price_change_pct'].str.replace('%','') #strip %
df['price_change_pct'] = df['price_change_pct'].str.replace(',','', regex=True) #strip ,
df['price_change_pct'] = df['price_change_pct'].str.replace('+','') #strip
df['price_change_pct'] = df['price_change_pct'].str.replace('undefined','') #strip
df.dropna(subset=['price_change_pct'], inplace=True)
df['price_change_pct'] = pd.to_numeric(df['price_change_pct']) #convert to integer

# converting into floats then into % for relevant columns
df['bullish_pct'] = df.bullish_pct/100
df['bearish_pct'] = df.bearish_pct/100
df['neutral_pct'] = df.neutral_pct/100
df['pct_mentions'] = df.pct_mentions/100

# #set index to ticker
df = df.set_index('ticker')

In [30]:
# create new cleaned .csv file
df.to_csv('WSB.csv')

In [50]:
#seperate way to clean data

# Convert this column to float
df.price_change_net = df.price_change_net.astype(float)

# Define a function to remove some characters in strings in a series
def remove_from_str(series, *args):
    for val in args:
        series = series.str.replace(val, '')
    
    return series

# Check rows that have price_change_pct == 'undefined'
df.price_change_pct = remove_from_str(df.price_change_pct, '+', ',', '%')

#check this again
df[df.price_change_pct == 'undefined']

# Only 8 rows, drop them
df = df[~(df.price_change_pct == 'undefined')]
# Convert to float
df['price_change_pct'] = df.price_change_pct.astype(float)
# Convert pct values to actual percents so it's not confusing (after we remove %)
df['price_change_pct'] = df.price_change_pct/100

In [18]:
df

Unnamed: 0_level_0,date,url,num_mentions,pct_mentions,pos_count,neg_count,bullish_pct,bearish_pct,neutral_pct,price,price_change_net,price_change_pct,time_of_price
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
GME,2020-12-28,https://finance.yahoo.com/quote/GME?p=GME,567,0.20000,257,131,0.45,0.2300,0.3100,20.99,0.84,4.1700,At close: 4:00PM EST
PLTR,2020-12-28,https://finance.yahoo.com/quote/PLTR?p=PLTR,418,0.14000,164,110,0.39,0.2600,0.3400,25.63,-2.12,-7.6400,At close: 4:00PM EST
PSTH,2020-12-28,https://finance.yahoo.com/quote/PSTH?p=PSTH,80,0.02000,51,19,0.63,0.2300,0.1200,25.99,-0.04,-0.1500,At close: 4:00PM EST
PSA,2020-12-28,https://finance.yahoo.com/quote/PSA?p=PSA,3,0.00500,0,0,0.00,0.0000,1.0000,229.14,0.29,0.1300,At close: 4:00PM EST
TD,2020-12-28,https://finance.yahoo.com/quote/TD?p=TD,9,0.00500,2,1,0.22,0.1100,0.6600,56.04,-0.04,-0.0700,At close: 4:00PM EST
...,...,...,...,...,...,...,...,...,...,...,...,...,...
BCE,2021-02-10,https://finance.yahoo.com/quote/BCE?p=BCE,4,0.00005,4,0,0.01,0.0000,0.0000,43.63,-0.02,-0.0005,At close: 4:00PM EST
RCI,2021-02-10,https://finance.yahoo.com/quote/RCI?p=RCI,1,0.00005,1,0,0.01,0.0000,0.0000,45.92,-0.82,-0.0175,At close: 4:00PM EST
DSL,2021-02-10,https://finance.yahoo.com/quote/DSL?p=DSL,1,0.00005,1,0,0.01,0.0000,0.0000,17.56,0.12,0.0069,At close: 4:00PM EST
SA,2021-02-10,https://finance.yahoo.com/quote/SA?p=SA,1,0.00005,0,1,0.00,0.0100,0.0000,19.20,0.06,0.0031,At close: 4:00PM EST


In [23]:
df.dtypes

date                datetime64[ns]
url                         object
num_mentions                 int64
pct_mentions               float64
pos_count                    int64
neg_count                    int64
bullish_pct                float64
bearish_pct                float64
neutral_pct                float64
price                      float64
price_change_net           float64
price_change_pct           float64
time_of_price               object
dtype: object