## Load and Transform Soybean Data


In [1]:
import datetime as dt
from pathlib import Path

import pandas as pd
from sqlalchemy import create_engine

### Extract CSVs into DataFrames

In [2]:
# Location of the raw soybean futures export, relative to the notebook's working directory.
soybean_file = "./US Soybeans Futures Historical Data.csv"
# Read the CSV into a DataFrame. Commas are parsed as thousands separators and
# "." as the decimal point, so numeric price columns can load as floats.
soybean_df = pd.read_csv(
    soybean_file,
    decimal='.',
    thousands=',',
)
# Preview the first rows.
soybean_df.head()


Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,14-Oct-19,939.75,941.62,945.38,930.62,-,0.57%
1,11-Oct-19,934.38,924.12,939.12,923.62,-,1.29%
2,10-Oct-19,922.5,919.38,933.62,918.88,-,0.03%
3,9-Oct-19,922.25,918.88,931.38,918.5,-,0.33%
4,8-Oct-19,919.25,915.62,923.62,910.5,-,0.38%


### Transform Soybean DataFrame

In [3]:
# Turn the "Date" strings (day-abbreviated month-two-digit year, e.g. "14-Oct-19")
# into pandas datetime values instead of leaving them as plain objects.
soybean_df['Date'] = soybean_df['Date'].pipe(pd.to_datetime, format='%d-%b-%y')


In [4]:
# Inspect the column dtypes to confirm how each column was parsed.
soybean_df.dtypes

Date        datetime64[ns]
Price              float64
Open               float64
High               float64
Low                float64
Vol.                object
Change %            object
dtype: object

In [5]:
# Add the average soybean price as a new column: the row-wise mean of the High and Low values.
soybean_df["soybean_avg_price"] = soybean_df.loc[:, ["High", "Low"]].mean(axis="columns")
# Preview the frame with the new column.
soybean_df.head()



Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %,soybean_avg_price
0,2019-10-14,939.75,941.62,945.38,930.62,-,0.57%,938.0
1,2019-10-11,934.38,924.12,939.12,923.62,-,1.29%,931.37
2,2019-10-10,922.5,919.38,933.62,918.88,-,0.03%,926.25
3,2019-10-09,922.25,918.88,931.38,918.5,-,0.33%,924.94
4,2019-10-08,919.25,915.62,923.62,910.5,-,0.38%,917.06


In [6]:
# Columns carried over into the transformed frame.
soybean_cols = ["Date", "Open", "soybean_avg_price"]

# Take an independent copy of those columns and give the price columns
# their final header names in one chained expression.
soybean_transformed = (
    soybean_df.loc[:, soybean_cols]
    .copy()
    .rename(columns={
        "Open": "Soybean_Open_Price",
        "soybean_avg_price": "Soybean_Avg_Price",
    })
)
# Non-null count per column.
soybean_transformed.count()

Date                  2638
Soybean_Open_Price    2638
Soybean_Avg_Price     2638
dtype: int64

In [13]:
# Preview the transformed frame before it is exported.
# NOTE(review): no cleaning actually happens in this cell. De-duplication on "Date"
# is left disabled below; re-enable it if the source CSV can contain repeated dates.
#soybean_transformed.drop_duplicates("Date", inplace=True)
soybean_transformed.head()

Unnamed: 0,Date,Soybean_Open_Price,Soybean_Avg_Price
0,2019-10-14,941.62,938.0
1,2019-10-11,924.12,931.37
2,2019-10-10,919.38,926.25
3,2019-10-09,918.88,924.94
4,2019-10-08,915.62,917.06


In [20]:
# Export the transformed frame to CSV.
# The path is relative to the notebook's working directory. Create the target
# folder first so the export does not fail when that folder does not exist yet.
output_path = Path("../../output/soybean_transformed.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
# index=False leaves the row index out of the written file.
soybean_transformed.to_csv(output_path, index=False)