In [1]:
import pandas as pd
import numpy as np
import os
import datetime
from sklearn import feature_extraction

In [2]:
def load_time_csv(file_path, iden_str):
    """Load every CSV in ``file_path`` whose name contains ``iden_str``.

    Each file is read without a header row and its columns are named
    'Time', 'Open', 'High', 'Low', 'Close'. All matching files are stacked
    row-wise into a single frame with a fresh 0..n-1 index.

    Parameters
    ----------
    file_path : str
        Directory to scan for CSV files.
    iden_str : str
        Substring a filename must contain to be loaded.

    Returns
    -------
    pandas.DataFrame
        The concatenated rows of all matching files.

    Raises
    ------
    FileNotFoundError
        If no filename in ``file_path`` contains ``iden_str``.
    """
    column_names = ['Time', 'Open', 'High', 'Low', 'Close']

    # os.listdir returns entries in arbitrary order; sort so the row order of
    # the combined frame is reproducible from run to run.
    matching_files = sorted(
        filename for filename in os.listdir(file_path) if iden_str in filename
    )
    if not matching_files:
        raise FileNotFoundError(
            "no file containing %r found in %r" % (iden_str, file_path)
        )

    # Read every file with the same column names, then concatenate once
    # (cheaper than growing a frame inside the loop).
    frames = [
        pd.read_csv(os.path.join(file_path, filename), names=column_names)
        for filename in matching_files
    ]
    return pd.concat(frames, ignore_index=True)

In [6]:
def _merge_tweet_text(tweet_df):
    """Map each raw 'created_at' value to the joined text of its original tweets.

    Rows whose 'is_retweet' value is True are dropped. Tweets sharing the
    same 'created_at' value have their 'text' joined with a single space,
    in the order they appear in ``tweet_df``.
    """
    originals = tweet_df[tweet_df['is_retweet'] != True]

    merged = {}
    for created_at, text in zip(originals['created_at'], originals['text']):
        if created_at in merged:
            merged[created_at] = merged[created_at] + " " + text
        else:
            merged[created_at] = text
    return merged


def _select_matching_tweets(merged_text, row_of_time, n_rows, trend_window):
    """Keep tweets whose timestamp has a usable row in the price data.

    A row is usable when it has at least ``trend_window`` rows before it
    (for the trend features) and one row after it (for the label).

    Returns a pair ``(selected_rows, selected_text)`` of equal length:
    the price-row position of each kept tweet and its merged text.
    """
    selected_rows = []
    selected_text = []

    for time_str, text in merged_text.items():
        try:
            time_stamp = datetime.datetime.strptime(time_str, '%m/%d/%y %H:%M')
        except (TypeError, ValueError):
            # TypeError: the value is not a string (e.g. a missing timestamp).
            # ValueError: the string does not match the expected format.
            continue

        row = row_of_time.get(time_stamp)
        if row is None:
            continue
        # Without this guard, row + 1 could run past the last price and a
        # negative slice start would yield a short/empty trend window.
        if row < trend_window or row + 1 >= n_rows:
            continue

        selected_rows.append(row)
        selected_text.append(text)

    return selected_rows, selected_text


def main():
    """Build the tweet/price training matrix and write it to total_array.csv.

    Steps:
      1. Load price rows from the 'Inputs' directory (files containing '_12')
         and tweets from 'Inputs/trumptweet.csv'.
      2. Merge the text of non-retweet tweets that share a timestamp and keep
         those whose timestamp appears in the price data.
      3. Turn the kept text into bag-of-words counts with CountVectorizer.
      4. For each kept tweet, emit a label (0 if the close at that row is
         >= the next row's close, else 1) and the 10 preceding close values.
      5. Save ``[label | trend | word counts]`` as CSV and print the shapes.

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    ValueError
        If no tweet matches a usable price row.
    """
    trend_window = 10  # number of preceding close values used as features

    label_df = load_time_csv('Inputs', '_12')
    tweet_df = pd.read_csv('Inputs/trumptweet.csv', encoding='latin-1')
    label_df['Time'] = pd.to_datetime(label_df['Time'])

    closed_number = list(label_df['Close'])

    # Timestamp -> row position lookup. setdefault keeps the FIRST row for a
    # duplicated timestamp, which is what list.index() would have returned.
    row_of_time = {}
    for row, stamp in enumerate(label_df['Time']):
        row_of_time.setdefault(stamp.to_pydatetime(), row)

    merged_text = _merge_tweet_text(tweet_df)
    selected_rows, selected_text = _select_matching_tweets(
        merged_text, row_of_time, len(closed_number), trend_window
    )
    if not selected_text:
        raise ValueError("no tweets matched a usable price timestamp")

    vectorizer = feature_extraction.text.CountVectorizer()
    feature_array = vectorizer.fit_transform(selected_text).toarray()
    # vocabulary_ has one entry per feature column; unlike get_feature_names()
    # it is available on both old and new scikit-learn releases.
    print(len(vectorizer.vocabulary_))
    print(feature_array.shape)

    label_list = [
        0 if closed_number[row] >= closed_number[row + 1] else 1
        for row in selected_rows
    ]
    trend_list = [closed_number[row - trend_window: row] for row in selected_rows]

    trend_array = np.array(trend_list)
    X_array = np.concatenate((trend_array, feature_array), axis=1)
    total_array = np.concatenate(
        (np.array(label_list).reshape(len(label_list), 1), X_array), axis=1
    )

    np.savetxt("total_array.csv", total_array, delimiter=",", fmt='%f')
    print(total_array.shape)

    return 0

In [7]:
# Run the pipeline only when this code is executed directly (not imported).
if __name__ == "__main__":
    main()

5969
(1859, 5969)
(1859, 5980)
