In [2]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset

import pandas as pd
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader

In [3]:
# Load the data
file_path = '/Users/naveenashish/Downloads/repo10fb1_2.csv'
data = pd.read_csv(file_path)

In [5]:
data, _ = train_test_split(data, test_size=0.9, stratify=data['faultbasis'])
len(data)

8143

### Vectorize and scale the features

In [6]:
# Define columns for preprocessing
numerical_features = ['modifications_count', 'additions_count', 'deletions_count', 'author_id', 'committer_id', 'hour', 'day', 'repo_id', 'ri']
categorical_features = ['author_name', 'author_login', 'author_email', 'committer_name', 'committer_login', 'committer_email', 'commit_msg', 'parent_shas', 'faulty_commit', 'ext']

# Create a pipeline for numerical features
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values if any
    ('scaler', StandardScaler())                 # Standardize features
])


# Create a pipeline for categorical features
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='NotAvailable')),  # Fill missing with "NotAvailable"
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))      # One-hot encode
])
# Combine pipelines into a column transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])

# Prepare the features (X) and target (y)
X = data[numerical_features + categorical_features]
y = data['faultbasis'].astype(int)

# Apply the preprocessing
X_transformed = preprocessor.fit_transform(X)

# Scale the combined features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_transformed)

# Convert the processed data to a DataFrame for review (optional)
X_scaled_df = pd.DataFrame(X_scaled)

# Display the shape of the processed data and the first few rows to verify
print("Processed and scaled data shape:", X_scaled.shape)
X_scaled_df.head()


Processed and scaled data shape: (8143, 25636)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25626,25627,25628,25629,25630,25631,25632,25633,25634,25635
0,-0.257403,0.0,-0.035402,-0.376509,-0.439118,-0.88281,1.049527,0.0,0.0,-0.011082,...,-0.011082,-0.011082,-0.027155,-0.011082,-0.097062,-0.060809,-0.029332,-0.011082,-0.022169,-0.015674
1,-0.257403,0.0,-0.022518,-0.403969,-0.465997,0.998365,0.545291,0.0,0.0,-0.011082,...,-0.011082,-0.011082,-0.027155,-0.011082,-0.097062,-0.060809,-0.029332,-0.011082,-0.022169,-0.015674
2,-0.257403,0.0,-0.037259,-0.403969,-0.465997,0.810248,0.041055,0.0,0.0,-0.011082,...,-0.011082,-0.011082,-0.027155,-0.011082,-0.097062,-0.060809,-0.029332,-0.011082,-0.022169,-0.015674
3,-0.257403,0.0,-0.023678,-0.403969,2.324673,-1.259045,0.041055,0.0,0.0,-0.011082,...,-0.011082,-0.011082,-0.027155,-0.011082,-0.097062,-0.060809,-0.029332,-0.011082,-0.022169,-0.015674
4,-0.257403,0.0,-0.036679,-0.403969,-0.465997,0.810248,0.545291,0.0,0.0,-0.011082,...,-0.011082,-0.011082,-0.027155,-0.011082,-0.097062,-0.060809,-0.029332,-0.011082,-0.022169,-0.015674


### Create Tensors for DNNs

In [7]:
# Convert preprocessed data to PyTorch tensors
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.long)

### Build an elementary DNN and evaluate

In [8]:
# Check unique values in the target to verify the range
print("Unique values in the target:", y_tensor.unique())

# Update num_classes based on the unique values
num_classes = y_tensor.max().item() + 1
print("Number of classes:", num_classes)

# Split the data into training and testing sets
train_size = int(0.8 * len(X_tensor))
test_size = len(X_tensor) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(TensorDataset(X_tensor, y_tensor), [train_size, test_size])

# Create data loaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define a simple 2-layer DNN
class SimpleDNN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(SimpleDNN, self).__init__()
        self.layer1 = nn.Linear(input_size, 64)
        self.layer2 = nn.Linear(64, num_classes)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.layer2(x)
        return x

# Get the input size
input_size = X_tensor.shape[1]

# Initialize the model, loss function, and optimizer
model = SimpleDNN(input_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        # Forward pass
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

# Evaluate the model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        _, predicted = torch.max(outputs, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

accuracy = 100 * correct / total
print(f'Test Accuracy: {accuracy:.2f}%')


The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.
Unique values in the target: tensor([1, 2])
Number of classes: 3
Epoch [1/20], Loss: 0.6044
Epoch [2/20], Loss: 0.8444
Epoch [3/20], Loss: 0.1497
Epoch [4/20], Loss: 0.0715
Epoch [5/20], Loss: 0.0435
Epoch [6/20], Loss: 0.0258
Epoch [7/20], Loss: 0.0164
Epoch [8/20], Loss: 0.0118
Epoch [9/20], Loss: 0.0091
Epoch [10/20], Loss: 0.0077
Epoch [11/20], Loss: 0.0068
Epoch [12/20], Loss: 0.0060
Epoch [13/20], Loss: 0.0056
Epoch [14/20], Loss: 0.0052
Epoch [15/20], Loss: 0.0049
Epoch [16/20], Loss: 0.0049
Epoch [17/20], Loss: 0.0046
Epoch [18/20], Loss: 0.0044
Epoch [19/20], Loss: 0.0043
Epoch [20/20], Loss: 0.0042
Test Accuracy: 92.94%


### Accuracy is a misleading metric, we really interested in the prediction of the sparse target class = 1 

In [9]:
# Evaluate the model and collect predictions and true labels
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        _, predicted = torch.max(outputs, 1)
        all_preds.extend(predicted.numpy())
        all_labels.extend(y_batch.numpy())

# Print the precision, recall, and F1-score per class
print(classification_report(all_labels, all_preds, digits=4))


              precision    recall  f1-score   support

           1     0.0714    0.0909    0.0800        55
           2     0.9679    0.9587    0.9633      1574

    accuracy                         0.9294      1629
   macro avg     0.5197    0.5248    0.5216      1629
weighted avg     0.9377    0.9294    0.9335      1629



#### Prepare sequences for RNN (though I wouldn't give away everything on the many ways to sequence this !)

In [10]:
# Convert timestamps to datetime and sort data
data['author_date'] = pd.to_datetime(data['author_date'])
data.sort_values(by='author_date', inplace=True)

# Feature engineering: Calculate time differences in seconds
data['time_diff'] = data['author_date'].diff().dt.total_seconds().fillna(0)

# Extract additional time-based features
data['day_of_week'] = data['author_date'].dt.dayofweek
data['hour_of_day'] = data['author_date'].dt.hour

# Prepare the features and target
features = ['modifications_count', 'additions_count', 'deletions_count', 'hour_of_day', 'day_of_week', 'time_diff']
target = 'faultbasis'

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(data[features])
y = data[target].values

# Prepare sequences for RNN
sequence_length = 10  # Adjust based on your needs
X_sequences = []
y_sequences = []

for i in range(len(X) - sequence_length):
    X_sequences.append(X[i:i+sequence_length])
    y_sequences.append(y[i+sequence_length])

X_sequences = torch.tensor(X_sequences, dtype=torch.float32)
y_sequences = torch.tensor(y_sequences, dtype=torch.long)

# Define a PyTorch dataset and dataloader
class SequenceDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

dataset = SequenceDataset(X_sequences, y_sequences)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

  X_sequences = torch.tensor(X_sequences, dtype=torch.float32)


In [13]:
# Convert sequences to numpy arrays before torch tensors
X_sequences = np.array(X_sequences)
y_sequences = np.array(y_sequences)

X_sequences = torch.tensor(X_sequences, dtype=torch.float32)
y_sequences = torch.tensor(y_sequences, dtype=torch.long)

# Check unique values in the target to verify the range
print("Unique values in the target:", np.unique(y_sequences))

# Update num_classes based on the unique values
num_classes = y_sequences.max().item() + 1

print("Number of classes:", num_classes)

# Define an RNN model with 2 LSTM layers
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super(RNNModel, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        out, _ = self.rnn(x)
        out = self.fc(out[:, -1, :])  # Use the last output for prediction
        return out

# Initialize the model with the correct number of classes
input_size = X_sequences.shape[2]
hidden_size = 64
model = RNNModel(input_size, hidden_size, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the RNN
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in dataloader:
        outputs = model(X_batch)
        
        # Ensure y_batch is in the correct range
        loss = criterion(outputs, y_batch)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(dataloader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')


model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for X_batch, y_batch in dataloader:
        outputs = model(X_batch)
        _, predicted = torch.max(outputs, 1)
        all_preds.extend(predicted.numpy())
        all_labels.extend(y_batch.numpy())

# Print the precision, recall, and F1-score per class
print(classification_report(all_labels, all_preds, digits=4))

Unique values in the target: [1 2]
Number of classes: 3
Epoch [1/20], Loss: 0.2005
Epoch [2/20], Loss: 0.1393
Epoch [3/20], Loss: 0.1386
Epoch [4/20], Loss: 0.1370
Epoch [5/20], Loss: 0.1361
Epoch [6/20], Loss: 0.1364
Epoch [7/20], Loss: 0.1363
Epoch [8/20], Loss: 0.1371
Epoch [9/20], Loss: 0.1369
Epoch [10/20], Loss: 0.1366
Epoch [11/20], Loss: 0.1399
Epoch [12/20], Loss: 0.1355
Epoch [13/20], Loss: 0.1370
Epoch [14/20], Loss: 0.1339
Epoch [15/20], Loss: 0.1360
Epoch [16/20], Loss: 0.1326
Epoch [17/20], Loss: 0.1328
Epoch [18/20], Loss: 0.1322
Epoch [19/20], Loss: 0.1309
Epoch [20/20], Loss: 0.1340
              precision    recall  f1-score   support

           1     1.0000    0.0040    0.0079       253
           2     0.9690    1.0000    0.9843      7880

    accuracy                         0.9690      8133
   macro avg     0.9845    0.5020    0.4961      8133
weighted avg     0.9700    0.9690    0.9539      8133

