In [1]:
import pandas as pd
import glob
import re

# Step 1: Load and concatenate CSV files
# The pattern is a raw string: the Windows path contains backslashes followed
# by letters (\R, \S, \I, \F, \G) that are not valid escape sequences, so a
# plain literal only works by accident and triggers a warning on current
# Python versions. The resulting string value is unchanged.
# sorted() fixes the load order, because glob does not guarantee one and the
# row order of the combined frame follows the file order.
file_paths = sorted(glob.glob(r'E:\RWTH\Sem2\IMES\IMES GROUP 3\For ML\Group*_Case*_fft.csv'))
dataframes = []

for file in file_paths:
    # Extract group and case numbers using regex
    match = re.search(r'Group(\d+)_Case(\d+)_fft', file)
    if match is None:
        # The glob wildcards also accept non-numeric names; such files are skipped.
        continue

    group_number = int(match.group(1))
    case_number = int(match.group(2))

    # Load the CSV file
    df = pd.read_csv(file)

    # Add group and case number columns (used only as merge keys below)
    df['Group number'] = group_number
    df['Case number'] = case_number

    dataframes.append(df)

# Fail with an explicit message instead of pd.concat's generic
# "No objects to concatenate" when nothing was loaded.
if not dataframes:
    raise FileNotFoundError(
        "No 'Group<N>_Case<M>_fft.csv' files were found; check the data directory path."
    )

# Concatenate all dataframes
combined_df = pd.concat(dataframes, ignore_index=True)

# Step 2: Create a metadata DataFrame with additional location data
# One entry per (group, case) pair; all lists must have the same length.
metadata = {
    'Group number': [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9],
    'Case number': [1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5],
    'Frequency': [0, 40, 40, 40, 40, 0, 30, 30, 40, 30, 0, 35, 35, 40, 35, 0, 40, 40, 40, 40, 0, 45, 45, 40, 45, 0, 50, 50, 40, 50, 0, 30, 30, 40, 30, 0, 35, 35, 40, 35, 0, 40, 40, 40, 40, 0, 45, 45, 40, 45],
    'Damping': [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, ],
    'Inclination': [0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0],
    'Frequency Location': ['0', '6', '5', '6', '6', '0', '6', '3', '6', '6', '0', '6', '4', '6', '6', '0', '6', '5', '6', '6', '0', '6', '3', '6', '6', '0', '6', '4', '6', '6', '0', '6', '5', '6', '6', '0', '6', '3', '6', '6', '0', '6', '4', '6', '6', '0', '6', '5', '6', '6',],
    'Damping Location': ['0', '0', '0', '0', '3', '0', '0', '0', '0', '3&4', '0', '0', '0', '0', '3', '0', '0', '0', '0', '3', '0', '0', '0', '0', '3&4', '0', '0', '0', '0', '3', '0', '0', '0', '0', '3', '0', '0', '0', '0', '3&4', '0', '0', '0', '0', '3', '0', '0', '0', '0', '3'],
    'Inclination Location': ['0', '0', '0', '3', '0', '0', '0', '0', '3', '0', '0', '0', '0', '4', '0', '0', '0', '0', '3', '0', '0', '0', '0', '4', '0', '0', '0', '0', '3', '0', '0', '0', '0', '4', '0', '0', '0', '0', '3', '0', '0', '0', '0', '4', '0', '0', '0', '0', '3', '0'],
}

# Step 3: Convert metadata to DataFrame
metadata_df = pd.DataFrame(metadata)

# Step 4: Merge the combined DataFrame with the metadata DataFrame
# Left join: every measurement row is kept and receives the metadata of its
# (group, case) pair; pairs missing from the metadata would get NaN values.
merged_df = pd.merge(combined_df, metadata_df, on=['Group number', 'Case number'], how='left')

# Step 5: Drop the helper merge-key columns added in Step 1
merged_df = merged_df.drop(columns=['Group number', 'Case number'])

# Inspect the merged DataFrame
print(merged_df.head())


       Time  AccX_Filtered  AccY_Filtered  AccZ_Filtered  GroupNumber  \
0  0.156307       0.432041      -0.194161      -0.447347            0   
1  0.173208       0.242370      -0.371198      -0.158538            0   
2  0.190098      -0.157650      -0.447262       0.175869            0   
3  0.206996      -0.454708      -0.323170       0.252963            0   
4  0.223897      -0.459705      -0.318585       0.215705            0   

   CaseNumber  Frequency  Damping  Inclination Frequency Location  \
0           1          0        0            0                  0   
1           1          0        0            0                  0   
2           1          0        0            0                  0   
3           1          0        0            0                  0   
4           1          0        0            0                  0   

  Damping Location Inclination Location  
0                0                    0  
1                0                    0  
2                0  

## TODO: Check whether the frequency, damping, and inclination locations should be one-hot encoded.
## TODO: Use PCA on AccX and AccY.
## TODO: Use the ideal-case time stamps to obtain the failure-case time stamps.
## 

In [2]:
# Plain-text preview: the leading 50 rows of the merged frame.
print(merged_df.iloc[:50])

# Rich notebook rendering of the merged frame itself.
display(merged_df)

        Time  AccX_Filtered  AccY_Filtered  AccZ_Filtered  GroupNumber  \
0   0.156307       0.432041      -0.194161      -0.447347            0   
1   0.173208       0.242370      -0.371198      -0.158538            0   
2   0.190098      -0.157650      -0.447262       0.175869            0   
3   0.206996      -0.454708      -0.323170       0.252963            0   
4   0.223897      -0.459705      -0.318585       0.215705            0   
5   0.240800      -0.208246      -0.427823       0.169884            0   
6   0.257696       0.098498      -0.363969      -0.093135            0   
7   0.274591       0.242413      -0.292537      -0.246291            0   
8   0.291473       0.168263      -0.372872      -0.141603            0   
9   0.308365      -0.072245      -0.395856       0.053679            0   
10  0.325273      -0.286135      -0.362507       0.176797            0   
11  0.342168      -0.320260      -0.382606       0.167228            0   
12  0.359070      -0.184411      -0.39

Unnamed: 0,Time,AccX_Filtered,AccY_Filtered,AccZ_Filtered,GroupNumber,CaseNumber,Frequency,Damping,Inclination,Frequency Location,Damping Location,Inclination Location
0,0.156307,0.432041,-0.194161,-0.447347,0,1,0,0,0,0,0,0
1,0.173208,0.242370,-0.371198,-0.158538,0,1,0,0,0,0,0,0
2,0.190098,-0.157650,-0.447262,0.175869,0,1,0,0,0,0,0,0
3,0.206996,-0.454708,-0.323170,0.252963,0,1,0,0,0,0,0,0
4,0.223897,-0.459705,-0.318585,0.215705,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
554890,51.339516,-0.167590,-0.555367,-0.482142,9,5,45,1,0,6,3,0
554891,51.349475,0.178082,0.464248,0.453902,9,5,45,1,0,6,3,0
554892,51.359434,-0.230599,-0.427171,-0.316466,9,5,45,1,0,6,3,0
554893,51.369393,0.258075,0.318407,0.127332,9,5,45,1,0,6,3,0


In [3]:
# Jumble the DataFrame
# frac=1 draws every row exactly once, i.e. a full row permutation.
# random_state pins the permutation so that a kernel restart reproduces the
# same shuffled frame (42 matches the seed used for train_test_split later on).
shuffled_df = merged_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Print the jumbled DataFrame
print(shuffled_df.head(20))  # Adjust the number to display more or fewer rows

         Time  AccX_Filtered  AccY_Filtered  AccZ_Filtered  GroupNumber  \
0   17.630654       0.753080       1.898469       0.272423            4   
1   38.467330       0.106982       0.273702       0.268129            1   
2    8.872742       0.295118       0.520120      -0.037766            4   
3   28.557899      -0.655854       0.184223      -0.367029            8   
4    1.018207       0.069379      -0.343729      -0.747371            2   
5   23.391612       0.290008       0.351640       0.940208            4   
6   41.056217      -0.022430      -0.070100       0.327643            0   
7   12.837902      -2.535161       0.688015       2.209633            1   
8   24.795045       0.473734       0.725004      -2.293274            6   
9   41.683306      -0.072203       0.046947      -0.309879            6   
10  24.278482      -0.303502       0.120159       0.823140            8   
11  43.941223      -0.139976      -0.110313       0.024735            6   
12  37.124734       0.014

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

# Step 1: Prepare the data produced by the previous cells
# `merged_df` (original row order) is the source for everything below, so that
# consecutive rows stay consecutive when the LSTM windows are built.
# `merged_df` itself is left untouched: all derived frames get their own names,
# which keeps this cell re-runnable without restarting the kernel.

# Drop irrelevant columns
# NOTE(review): `data` is built from the shuffled frame but is not referenced
# again in the visible part of this notebook; kept in case a later cell uses it.
columns_to_exclude = ['GroupNumber', 'CaseNumber']  # Keep 'Time' and categorical locations
data = shuffled_df.drop(columns=columns_to_exclude)

# Handle categorical columns
categorical_columns = ['Damping Location', 'Inclination Location', 'Frequency Location']
encoder = OneHotEncoder()
# '3&4' is rewritten to '34' before encoding so the generated column names
# do not contain '&'.
encoded_categorical = encoder.fit_transform(
    merged_df[categorical_columns].replace({'3&4': '34'})
).toarray()

# Create DataFrame for encoded columns
# Reusing merged_df's index guarantees row-wise alignment in the concat below.
encoded_categorical_df = pd.DataFrame(
    encoded_categorical,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=merged_df.index,
)

# Replace the original categorical columns by their one-hot encoded versions
encoded_df = pd.concat(
    [merged_df.drop(columns=categorical_columns), encoded_categorical_df],
    axis=1,
)

# Assume Frequency as the target for simplicity; you can change it as needed
target_column = 'Frequency'

# Normalize the data
# The target column is excluded from the inputs: feeding a scaled copy of
# 'Frequency' to the model while also predicting 'Frequency' is target leakage.
# NOTE(review): the remaining metadata columns (Damping, Inclination and the
# one-hot location columns) are still part of the inputs; confirm that they
# are meant to be known at prediction time.
# NOTE(review): the scaler is fitted on all rows before the train/test split.
non_feature_columns = ['Time', 'GroupNumber', 'CaseNumber', target_column]
feature_df = encoded_df.drop(columns=non_feature_columns)
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(feature_df)
scaled_df = pd.DataFrame(scaled_features, columns=feature_df.columns)

# Add back the 'Time' column (if needed for sequencing)
# NOTE(review): 'Time' is added unscaled and becomes a model input as well.
scaled_df['Time'] = encoded_df['Time'].values

# Reorder columns so that 'Time' comes first
scaled_df = scaled_df[['Time'] + [col for col in scaled_df.columns if col != 'Time']]

target = encoded_df[target_column]

# Step 3: Create sequences for LSTM input
sequence_length = 10  # You can change this length based on your data

def create_sequences(data, target, sequence_length, groups=None):
    """Build sliding windows over `data` together with their next-step labels.

    Window ``i`` consists of rows ``i .. i + sequence_length - 1`` of `data`;
    its label is ``target[i + sequence_length]``, i.e. the entry that directly
    follows the window.

    Parameters
    ----------
    data : array-like
        Samples along the first axis (e.g. ``(n_rows, n_features)``).
    target : array-like
        One label per row of `data`, indexed by position.
    sequence_length : int
        Number of consecutive rows per window; must not be negative.
    groups : array-like of shape (n_rows,), optional
        Identifier of the recording each row belongs to. When given, only
        windows whose rows *and* label row lie in the same uninterrupted run
        of equal identifiers are returned, so that no window straddles two
        recordings. ``None`` (default) keeps every window.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Windows of shape ``(n_windows, sequence_length, ...)`` and labels of
        shape ``(n_windows, ...)``. If `data` has no more than
        `sequence_length` rows, both arrays are empty but keep these shapes,
        so downstream code can still read the feature dimension.

    Raises
    ------
    ValueError
        If `sequence_length` is negative or `groups` does not match `data`.
    IndexError
        If `target` has fewer entries than `data`.
    """
    data = np.asarray(data)
    target = np.asarray(target)
    if sequence_length < 0:
        raise ValueError("sequence_length must be non-negative")

    n_rows = len(data)
    n_windows = max(n_rows - sequence_length, 0)
    # Slicing a too-short target would silently truncate the labels, so fail
    # loudly instead (positional look-ups would raise IndexError as well).
    if n_windows and len(target) < n_rows:
        raise IndexError("target must have at least as many entries as data")

    starts = np.arange(n_windows)

    if groups is not None:
        groups = np.asarray(groups)
        if groups.ndim != 1 or len(groups) != n_rows:
            raise ValueError("groups must be 1-D with one entry per row of data")
        # run_id increases by one whenever the group identifier changes, so a
        # window is homogeneous exactly when its first row and its label row
        # carry the same run_id.
        run_id = np.concatenate(([0], np.cumsum(groups[1:] != groups[:-1])))
        starts = starts[run_id[starts] == run_id[starts + sequence_length]]

    # Row indices of every window, gathered in one vectorised indexing step
    # instead of a Python-level loop over all rows.
    window_index = starts[:, None] + np.arange(sequence_length)[None, :]
    return data[window_index], target[starts + sequence_length]

# Turn the scaled feature table into (window, next-step label) pairs.
X, y = create_sequences(scaled_df.values, target.values, sequence_length)

# Step 4: Hold out 20% of the windows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 5: Define the LSTM model
# Two stacked 64-unit LSTM layers, each followed by batch normalisation and
# 30% dropout, feeding a single L2-regularised output unit.
model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(sequence_length, X.shape[2])),
    BatchNormalization(),
    Dropout(0.3),
    LSTM(64, return_sequences=False),
    BatchNormalization(),
    Dropout(0.3),
    Dense(1, kernel_regularizer='l2'),
])

# Adam optimizer with an explicitly stated learning rate; MSE regression loss
optimizer = Adam(learning_rate=0.001)
model.compile(loss='mean_squared_error', optimizer=optimizer)

# Step 6: Train the model with early stopping
# Training stops once val_loss has not improved for 10 epochs and the best
# weights seen so far are restored.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Step 7: Evaluate the model on the held-out windows
loss = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")

# Step 8: Make predictions
y_pred = model.predict(X_test)

# Plot the learning curves (training first, then validation)
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()
plt.show()

Epoch 1/100
 2514/11098 [=====>........................] - ETA: 1:16 - loss: 140.3011