In [55]:
import pandas as pd
from scipy.io import arff

# Number of leading columns in each ARFF row that hold the series values.
# The last column of each row is read as the class label.
SERIES_LENGTH = 512


def decode_byte_columns(frame):
    """Return a copy of ``frame`` with object-dtype columns decoded as UTF-8.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame built from ``arff.loadarff`` output; its object columns are
        expected to hold byte strings.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is not modified.
    """
    decoded = frame.copy()
    for column in decoded.columns:
        if decoded[column].dtype == "object":
            decoded[column] = decoded[column].str.decode("utf-8")
    return decoded


def to_long_frame(wide, include_class=True, series_length=SERIES_LENGTH, decimals=3):
    """Reshape one-row-per-series data into one row per observation.

    Parameters
    ----------
    wide : pandas.DataFrame
        One series per row: the first ``series_length`` columns are the
        values and the last column is the class label.
    include_class : bool, default True
        When True, add a "class" column repeating each row's label for every
        one of its observations.
    series_length : int, default SERIES_LENGTH
        Number of leading columns to treat as series values.
    decimals : int, default 3
        Number of decimal places the values are rounded to.

    Returns
    -------
    pandas.DataFrame
        Columns "id" (the row's index label in ``wide``), "values", and
        optionally "class", with a fresh 0..n-1 index.
    """
    series_values = wide.iloc[:, :series_length]
    points_per_series = series_values.shape[1]

    columns = {
        "id": wide.index.repeat(points_per_series).to_numpy(),
        # Row-major flattening keeps each series' points contiguous and in order.
        "values": series_values.to_numpy().ravel(),
    }
    if include_class:
        columns["class"] = wide.iloc[:, -1].repeat(points_per_series).to_numpy()

    long_frame = pd.DataFrame(columns)
    long_frame["values"] = long_frame["values"].round(decimals)
    return long_frame


# Training set: keep the class label next to every observation.
data, meta = arff.loadarff('BeetleFly_TRAIN.arff')
df = decode_byte_columns(pd.DataFrame(data))
train_df = to_long_frame(df, include_class=True)
print(train_df)

# Test set: same reshaping, but the class label is deliberately left out.
test_data, test_meta = arff.loadarff('BeetleFly_TEST.arff')
test_df = to_long_frame(pd.DataFrame(test_data), include_class=False)
print(test_df)



       id  values class
0       0   1.247     1
1       0   1.175     1
2       0   1.069     1
3       0   0.964     1
4       0   0.895     1
...    ..     ...   ...
10235  19   0.795     2
10236  19   0.916     2
10237  19   1.037     2
10238  19   1.197     2
10239  19   1.318     2

[10240 rows x 3 columns]
       id  values
0       0   1.740
1       0   1.733
2       0   1.709
3       0   1.633
4       0   1.541
...    ..     ...
10235  19   1.783
10236  19   1.873
10237  19   1.925
10238  19   2.017
10239  19   2.079

[10240 rows x 2 columns]


In [56]:
# Boolean mask of the training frame: True wherever a cell is missing.
null_values = train_df.isna()

# Summing the mask column-wise gives the number of missing cells per column.
null_count = null_values.sum()

# Show the per-column totals.
print(null_count)

id        0
values    0
class     0
dtype: int64


In [57]:
# Boolean mask of the test frame: True wherever a cell is missing.
null_values = test_df.isna()

# Summing the mask column-wise gives the number of missing cells per column.
null_count = null_values.sum()

# Show the per-column totals.
print(null_count)

id        0
values    0
dtype: int64


In [58]:
# Training observations labelled class '1' (labels are compared as strings).
class1_train_data = train_df.loc[train_df['class'].eq('1')]
display(class1_train_data)

Unnamed: 0,id,values,class
0,0,1.247,1
1,0,1.175,1
2,0,1.069,1
3,0,0.964,1
4,0,0.895,1
...,...,...,...
5115,9,1.300,1
5116,9,1.396,1
5117,9,1.479,1
5118,9,1.610,1


In [59]:
# Training observations labelled class '2' (labels are compared as strings).
class2_train_data = train_df.loc[train_df['class'].eq('2')]
display(class2_train_data)

Unnamed: 0,id,values,class
5120,10,2.506,2
5121,10,2.479,2
5122,10,2.455,2
5123,10,2.434,2
5124,10,2.421,2
...,...,...,...
10235,19,0.795,2
10236,19,0.916,2
10237,19,1.037,2
10238,19,1.197,2


In [60]:
import plotly.express as px
from scipy.stats import mannwhitneyu

In [61]:
# Distribution of the class-1 observations, binned into at most 500 bins.
fig = px.histogram(
    data_frame=class1_train_data,
    x='values',
    nbins=500,
    title='Histogram of Values change (Class 1)',
)
fig.show()

In [62]:
# Distribution of the class-2 observations, binned into at most 500 bins.
fig = px.histogram(
    data_frame=class2_train_data,
    x='values',
    nbins=500,
    title='Histogram of Values Change (Class 2)',
)
fig.show()

In [63]:
# Summary statistics (count, mean, std, quartiles, extremes) of class-1 values.
column_stats = class1_train_data.loc[:, 'values'].describe()
display(column_stats)

count    5120.000000
mean       -0.000001
std         0.999119
min        -2.257000
25%        -0.789250
50%        -0.050500
75%         0.688000
max         2.364000
Name: values, dtype: float64

In [64]:
# Summary statistics (count, mean, std, quartiles, extremes) of class-2 values.
column_stats = class2_train_data.loc[:, 'values'].describe()
display(column_stats)

count    5120.000000
mean       -0.000006
std         0.999124
min        -2.517000
25%        -0.750250
50%        -0.061000
75%         0.691250
max         2.506000
Name: values, dtype: float64

In [66]:
# Violin plot (with an embedded box plot and every raw point) comparing the
# distribution of the series values between the target classes.
# The axis label and title describe the plotted `values` column; the earlier
# "Price Changes" wording did not match this data.
fig = px.violin(
    train_df,
    x='class',
    y='values',
    box=True,
    points="all",
    color='class',
    labels={'class': 'Target Class', 'values': 'Values'},
    title='Distribution of Values by Target Class',
)
fig.update_traces(marker=dict(size=3))  # small markers so the raw points don't hide the violins
fig.show()

In [68]:
# Peek at the first observations of the training series with id 1.
id1_df = train_df.loc[train_df['id'].eq(1)]
display(id1_df.head(4))

Unnamed: 0,id,values,class
512,1,0.826,1
513,1,0.881,1
514,1,0.818,1
515,1,0.674,1


In [77]:
# Peek at the first observations of the training series with id 10.
id2_df = train_df.loc[train_df['id'].eq(10)]
display(id2_df.head(4))

Unnamed: 0,id,values,class
5120,10,2.506,2
5121,10,2.479,2
5122,10,2.455,2
5123,10,2.434,2


In [84]:
# Lags (in samples) at which each series' autocorrelation is measured.
lags = range(1, 5)
# One (id, lag, autocorrelation) tuple per class-1 series and lag.
autocorrelation_results = []

# Iterate over the ids actually present in the class-1 data instead of a
# hard-coded range: the previous range(1, 10) skipped the series with id 0.
# The loop variable is named series_id so the builtin `id` is not shadowed.
for series_id, id_df in class1_train_data.groupby('id'):
    for lag in lags:
        autocorr_value = id_df['values'].autocorr(lag=lag)
        autocorrelation_results.append((series_id, lag, autocorr_value))

In [85]:
# Tabulate the (id, lag, autocorrelation) tuples held in autocorrelation_results.
result_columns = ['ID', 'Lag', 'Autocorrelation']
autocorr_df = pd.DataFrame(data=autocorrelation_results, columns=result_columns)

# Average the autocorrelation over all ids, separately for each lag.
mean_by_lag = autocorr_df.groupby('Lag')['Autocorrelation'].mean()
mean_autocorr_df = mean_by_lag.reset_index()

# Line plot of the mean autocorrelation against the lag.
fig = px.line(
    mean_autocorr_df,
    x='Lag',
    y='Autocorrelation',
    title='Mean Autocorrelation of Change for class 1',
)
fig.show()


In [86]:
# Lags (in samples) at which each series' autocorrelation is measured.
lags = range(1, 5)
# One (id, lag, autocorrelation) tuple per class-2 series and lag.
autocorrelation_results = []

# Iterate over the ids actually present in the class-2 data rather than a
# hard-coded range(10, 20), so the loop stays correct if the ids change.
# The loop variable is named series_id so the builtin `id` is not shadowed.
for series_id, id_df in class2_train_data.groupby('id'):
    for lag in lags:
        autocorr_value = id_df['values'].autocorr(lag=lag)
        autocorrelation_results.append((series_id, lag, autocorr_value))

In [87]:
# Tabulate the (id, lag, autocorrelation) tuples held in autocorrelation_results.
autocorr_df = pd.DataFrame(autocorrelation_results, columns=['ID', 'Lag', 'Autocorrelation'])

# Average the autocorrelation over all ids, separately for each lag.
mean_autocorr_df = autocorr_df.groupby('Lag')['Autocorrelation'].mean().reset_index()

# Line plot of the mean autocorrelation against the lag.
# The preceding cell fills autocorrelation_results from class2_train_data,
# so the title names class 2 (it previously said "class 1").
fig = px.line(mean_autocorr_df, x='Lag', y='Autocorrelation', title='Mean Autocorrelation of Change for class 2')
fig.show()
