In [None]:
!pip install seaborn
!pip install scipy

In [None]:
import datetime as dt
import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
from pylab import mpl, plt
from scipy import stats
from sklearn.linear_model import LinearRegression

## Analysis Goals
- Summary statistics (e.g. average, max, min, volatility)
- Trend analysis and visualizations
- Correlation with other indicators (if available)
- Forecasting or modeling
- Custom insights based on your goals

## Data Cleaning

In [None]:
# Number of lines to skip at the top and at the bottom of every monthly file
header = 1
footer = 2

# Month-start timestamps covering January 2015 through August 2025
date_range = pd.date_range(start='2015-01-01', end='2025-08-01', freq='MS')

# One CSV per month, named FMTQIK_<year><zero-padded month>.csv
filenames = [
    f"FMTQIK_{date.year}{date.month:02d}.csv"
    for date in date_range
]

# Read the files one by one; skipfooter is only supported by the python engine
monthly_frames = []
for filename in filenames:
    monthly_frames.append(
        pd.read_csv(filename, skiprows=header, skipfooter=footer, engine='python')
    )

# Stack all months under a fresh 0..n-1 index, then discard columns that hold no data
df = pd.concat(monthly_frames, ignore_index=True).dropna(axis=1, how='all')

In [None]:
# Strip thousands separators and convert the numeric columns.
# Casting to str first keeps this cell re-runnable: once a column is already
# numeric (after a previous run, or because read_csv parsed it as a number),
# the .str accessor would raise AttributeError.
columns_to_clean = ['Trade Volume', 'Trade Value', 'Transaction', 'TAIEX']
for col in columns_to_clean:
    without_commas = df[col].astype(str).str.replace(',', '', regex=False)
    # Anything that still is not a number (including the text 'nan') becomes NaN
    df[col] = pd.to_numeric(without_commas, errors='coerce')

# Convert Date column to datetime; unparseable values become NaT
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Keep only rows where the date and all four numeric columns are present
df_clean = df.dropna(subset=['Date', 'TAIEX', 'Trade Volume', 'Trade Value', 'Transaction'])

## Save to CSV

In [None]:
# Persist the cleaned frame. The row index is written as the first (unnamed)
# column; the cells that reload this file name that column 'Index'.
df_clean.to_csv('FMTQIK_2025-0903.csv', index=True)

## Visualization

In [None]:
# Line chart of the TAIEX index only (trade volume is plotted in the next figure).
# y is passed as a one-element list, and the 'value' / 'variable' axis and legend
# names are relabelled below.
axis_labels = {'value': 'Value', 'variable': 'Legend'}
fig = px.line(
    df_clean,
    x='Date',
    y=['TAIEX'],
    labels=axis_labels,
    title='TAIEX Index Over Time',
)
fig.show()

In [None]:
# (column, subplot title) for each stacked panel, top to bottom
panels = [
    ('TAIEX', "TAIEX Index Over Time"),
    ('Trade Volume', "Trade Volume Over Time"),
]

# Two rows sharing the date axis; the top panel takes 70% of the height
fig = make_subplots(
    rows=2,
    cols=1,
    shared_xaxes=True,
    vertical_spacing=0.05,
    row_heights=[0.7, 0.3],
    subplot_titles=tuple(panel_title for _, panel_title in panels),
)

# One line trace per panel, named after the column it shows
for panel_row, (column, _) in enumerate(panels, start=1):
    trace = go.Scatter(
        x=df_clean['Date'],
        y=df_clean[column],
        mode='lines',
        name=column,
    )
    fig.add_trace(trace, row=panel_row, col=1)

# Overall figure size and title
fig.update_layout(height=800, title_text="TAIEX Index and Trade Volume (70/30 Split)")

fig.show()

## Data Analysis

In [None]:
# Print column dtypes and non-null counts, then display summary statistics
# of the cleaned frame as the cell's result
df_clean.info()
df_clean.describe()

In [None]:
# Pairwise correlation matrix restricted to the numeric columns
# (the Date column and any remaining text columns are excluded)
df_clean.corr(numeric_only=True)

In [None]:
# Grid of pairwise plots for the four cleaned numeric series
sns.pairplot(df_clean[['TAIEX','Trade Volume','Trade Value','Transaction']])

In [None]:
# Histogram of trade volume in 30 bins. Uses df_clean, like the other analysis
# cells, so the plot reflects the same rows as the statistics above and does not
# depend on whatever `df` currently refers to.
sns.displot(x='Trade Volume', data=df_clean, bins=30)

In [None]:
# Box plot of trade volume. Uses df_clean, like the other analysis cells, so the
# plot reflects the same rows as the statistics above and does not depend on
# whatever `df` currently refers to.
sns.boxplot(y='Trade Volume', data=df_clean)

In [None]:
# Scatter of trade value (x) against trade volume (y) for the cleaned rows
sns.scatterplot(x='Trade Value', y='Trade Volume', data=df_clean)

In [None]:
# Ordinary least-squares fit of trade volume (y) on trade value (x);
# the result object is displayed as the cell's output
res = stats.linregress(x=df_clean['Trade Value'], y=df_clean['Trade Volume'])
res

In [None]:
# Scatter of trade value against trade volume with the fitted regression line in red.
# regplot returns the Axes it drew on, so labels and title are set on it directly.
ax = sns.regplot(
    data=df_clean,
    x="Trade Value",
    y="Trade Volume",
    line_kws={"color": "r"},
)
ax.set_xlabel('Trade Value')
ax.set_ylabel('Trade Volume')
ax.set_title('Relationship between trade value and volume')

In [None]:
# Reload the cleaned data written by the "Save to CSV" cell.
# That file's first line is the column header, so it must be read with header=0;
# header=1 would consume the first data row as column names and drop it.
df = pd.read_csv("FMTQIK_2025-0903.csv", header=0)

# Rename columns for clarity; the first (unnamed) column is the saved row index
df.columns = ['Index', 'Date', 'Trade Volume', 'Trade Value', 'Transaction', 'TAIEX', 'Change']

# Parse dates (unparseable values become NaT)
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Drop rows that cannot take part in the fit, then order chronologically
df = df.dropna(subset=['Date', 'TAIEX']).sort_values('Date')

# Express each date as a day ordinal so it can serve as the regression feature
df['Date_ordinal'] = df['Date'].map(pd.Timestamp.toordinal)
X = df[['Date_ordinal']]
y = df['TAIEX']

# Fit a straight line of TAIEX against time
model = LinearRegression()
model.fit(X, y)

# Fitted trend value for every observed date
df['TAIEX_trend'] = model.predict(X)

# Actual TAIEX and the fitted trend line on one chart
fig = px.line(df, x='Date', y=['TAIEX', 'TAIEX_trend'],
              labels={'value': 'TAIEX Index', 'Date': 'Date', 'variable': 'Series'},
              title='TAIEX Index Over Time with Linear Regression Trend Line')

fig.show()

In [None]:
# Print dtypes and non-null counts of `df` as left by the preceding cell
df.info()

In [None]:
# Reload the cleaned data written by the "Save to CSV" cell.
# That file's first line is the column header, so it must be read with header=0;
# header=1 would consume the first data row as column names and drop it.
df = pd.read_csv("FMTQIK_2025-0903.csv", header=0)

# Rename columns for clarity; the first (unnamed) column is the saved row index
df.columns = ['Index', 'Date', 'Trade Volume', 'Trade Value', 'Transaction', 'TAIEX', 'Change']

# Parse dates (unparseable values become NaT)
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Drop rows that cannot take part in the fit, then order chronologically
df = df.dropna(subset=['Date', 'TAIEX']).sort_values('Date')

# Express each date as a day ordinal so it can serve as the regression feature
df['Date_ordinal'] = df['Date'].map(pd.Timestamp.toordinal)
X = df[['Date_ordinal']]
y = df['TAIEX']

# Fit a straight line of TAIEX against time
model = LinearRegression()
model.fit(X, y)

# Fitted trend value for every observed date
df['TAIEX_trend'] = model.predict(X)

# Static matplotlib version of the chart, drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df['Date'], df['TAIEX'], label='TAIEX', color='blue')
ax.plot(df['Date'], df['TAIEX_trend'], label='Linear Regression Trend', color='orange')
ax.set_xlabel('Date')
ax.set_ylabel('TAIEX Index')
ax.set_title('TAIEX Index Over Time with Linear Regression Trend Line')
ax.legend()
ax.grid(True)
fig.tight_layout()

# Save the plot next to the notebook, then display it
fig.savefig("taiex_regression_plot.png")
plt.show()

In [None]:
# Reload the cleaned data written by the "Save to CSV" cell.
# That file's first line is the column header, so it must be read with header=0;
# header=1 would consume the first data row as column names, which would make
# the growth calculation start from the second observation instead of the first.
df = pd.read_csv("FMTQIK_2025-0903.csv", header=0)

# Rename columns for clarity; the first (unnamed) column is the saved row index
df.columns = ['Index', 'Date', 'Trade Volume', 'Trade Value', 'Transaction', 'TAIEX', 'Change']

# Parse dates (unparseable values become NaT)
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Drop rows without a usable date or index value, then order chronologically
df = df.dropna(subset=['Date', 'TAIEX']).sort_values('Date')

# First and last observations of the series
start_date = df['Date'].iloc[0]
end_date = df['Date'].iloc[-1]
start_value = df['TAIEX'].iloc[0]
end_value = df['TAIEX'].iloc[-1]

# Elapsed time in years, using an average year length of 365.25 days
years = (end_date - start_date).days / 365.25

# Compound annual growth rate: (end / start) ** (1 / years) - 1
cagr = ((end_value / start_value) ** (1 / years)) - 1

# Linear change in index points per year
average_annual_growth = (end_value - start_value) / years

# Display the results
print(f"Start Date: {start_date.date()}, TAIEX: {start_value}")
print(f"End Date: {end_date.date()}, TAIEX: {end_value}")
print(f"Years: {years:.2f}")
print(f"Annualized Growth Rate (CAGR): {cagr * 100:.2f}%")
print(f"Average Annual Growth: {average_annual_growth:.2f} points/year")

In [None]:
# Growth-rate calculation on manually entered start and end observations.
# NOTE(review): the printed labels say "TAIEX" while the last line reports
# "W/year" - confirm which series these figures describe.
start_date, end_date = datetime(2014, 3, 1), datetime(2025, 10, 5)
start_value, end_value = 1478, 3430

# Elapsed time in years, using an average year length of 365.25 days
elapsed = end_date - start_date
years = elapsed.days / 365.25

# Compound annual growth rate: total growth multiple, annualised geometrically
growth_multiple = end_value / start_value
cagr = growth_multiple ** (1 / years) - 1

# Linear change per year over the same span
average_annual_growth = (end_value - start_value) / years

# Report, one line per figure
report_lines = [
    f"Start Date: {start_date.date()}, TAIEX: {start_value}",
    f"End Date: {end_date.date()}, TAIEX: {end_value}",
    f"Years: {years:.2f}",
    f"Annualized Growth Rate (CAGR): {cagr * 100:.2f}%",
    f"Average Annual Growth: {average_annual_growth:.2f} W/year",
]
print("\n".join(report_lines))