In [None]:
import json
import pandas as pd
import syntok.segmenter as segmenter
import os

# Import Datasets

In [None]:
# 1st Features Dimension: Repository
repo_feat_accept_df = pd.read_csv('./1_Repository DImension/repo_metrics_accepted.csv')
repo_feat_reject_df = pd.read_csv('./1_Repository DImension/repo_metrics_rejected.csv')

# 2nd Features Dimension: Pull Request
pr_feat_accept_df = pd.read_csv('./2_Pull Request Dimension/pr_metrics_accepted.csv')
pr_feat_reject_df = pd.read_csv('./2_Pull Request Dimension/pr_metrics_rejected.csv')

# 3rd Features Dimension: Developer Experience
pr_dev_accept_df = pd.read_csv('./3_Dev_Experience/pr_dev_metrics_accepted.csv')
pr_dev_reject_df = pd.read_csv('./3_Dev_Experience/pr_dev_metrics_rejected.csv')

# 5th Features Dimension: Readability
pr_read_accept_df = pd.read_csv('./5_Readability/issue_pr_readability_accepted.csv')
pr_read_reject_df = pd.read_csv('./5_Readability/issue_pr_readability_rejected.csv')

# Combined and Filter the data

In [118]:
# ============================================================
# 1. Re-create the three combined dataframes (Logic from Cell 97)
# Note: This code assumes the raw dataframes (e.g., pr_feat_accept_df) are available.
# ============================================================

# --- 2nd Features Dimension: Pull Request ---
columns_to_drop_2 = ['Repo', 'NumPathsInFile', 'AvgPathCharLength', 'MaxPathCharLength']
rename_map_2 = {'PR_number': 'PR_Number'}
pr_accept_df_2 = pr_feat_accept_df.drop(columns=columns_to_drop_2, axis=1, inplace=False).rename(columns=rename_map_2)
pr_reject_df_2 = pr_feat_reject_df.drop(columns=columns_to_drop_2, axis=1, inplace=False).rename(columns=rename_map_2)
pr_accept_df_2['Result'] = 1
pr_reject_df_2['Result'] = 0
combined_df_2 = pd.concat([pr_accept_df_2, pr_reject_df_2], ignore_index=True)

# --- 3rd Features Dimension: Developer Experience ---
columns_to_drop_3 = ['Repo', 'Creation_Date', 'User', 'Total_Commits_Repo', 'Creation_Date']
pr_accept_df_3 = pr_dev_accept_df.drop(columns=columns_to_drop_3, axis=1, inplace=False)
pr_reject_df_3 = pr_dev_reject_df.drop(columns=columns_to_drop_3, axis=1, inplace=False)
pr_accept_df_3['Result'] = 1
pr_reject_df_3['Result'] = 0
combined_df_3 = pd.concat([pr_accept_df_3, pr_reject_df_3], ignore_index=True)

# --- 5th Features Dimension: Readability ---
columns_to_drop_5 = ['type', 'pr_title', 'pr_body', 'issue_number', 'issue_title', 'issue_body']
rename_map_5 = {'pr_number': 'PR_Number'}
pr_accept_df_5 = pr_read_accept_df.drop(columns=columns_to_drop_5, axis=1, inplace=False).rename(columns=rename_map_5)
pr_reject_df_5 = pr_read_reject_df.drop(columns=columns_to_drop_5, axis=1, inplace=False).rename(columns=rename_map_5)
pr_accept_df_5['Result'] = 1
pr_reject_df_5['Result'] = 0
combined_df_5 = pd.concat([pr_accept_df_5, pr_reject_df_5], ignore_index=True)

# ============================================================
# 2. Perform Left Joins
# ============================================================

# Drop redundant 'Result' column from subsequent merges
combined_df_3_cleaned = combined_df_3.drop(columns=['Result'])
combined_df_5_cleaned = combined_df_5.drop(columns=['Result'])

final_combined_df = combined_df_2.copy()

# Left Merge 1: Keeps all rows from combined_df_2, adding features from combined_df_3
final_combined_df = pd.merge(
    final_combined_df,
    combined_df_3_cleaned,
    on='PR_Number',
    how='left'
)

# Left Merge 2: Keeps all rows from the previous step, adding features from combined_df_5
final_combined_df = pd.merge(
    final_combined_df,
    combined_df_5_cleaned,
    on='PR_Number',
    how='left'
)

# ============================================================
# 3. Fill NaNs with 0
# ============================================================
final_combined_df.fillna(0, inplace=True)

# ============================================================
# 4. Finalize and Display
# ============================================================
# Set PR_Number as the index
final_combined_df.set_index('PR_Number', inplace=True)
final_combined_df.sort_index(ascending=True, inplace=True)

# Reorder columns to put Result last
cols = final_combined_df.columns.tolist()
result_col = 'Result'
feature_cols_final = [col for col in cols if col != result_col]
new_col_order = feature_cols_final + [result_col]
final_combined_df = final_combined_df[new_col_order]

final_combined_df.to_csv("./test_ML_datasets.csv", index=True)

# Machine Learning 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# 1. Separate Features (X) and Target (y)
# PR_Number is already the index and is excluded from features X
X = final_combined_df.drop(columns=['Result'])
y = final_combined_df['Result']

# 2. Split the data into training (70%) and testing (30%) sets
# Using stratify=y ensures the proportion of Accepted/Rejected PRs is maintained in both sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

# 3. Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Define the models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# 5. Train, Predict, and Evaluate
results = {}

for name, model in models.items():
    # Use scaled data only for Logistic Regression
    if name == "Logistic Regression":
        model.fit(X_train_scaled, y_train)      # train model
        y_pred = model.predict(X_test_scaled)   # Make prediction
    else:
        # Tree-based models (DT, RF, XGB) do not require scaling
        model.fit(X_train, y_train)             # train model
        y_pred = model.predict(X_test)          # Make prediction

    # Store results
    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Classification Report': classification_report(y_test, y_pred, target_names=['Rejected (0)', 'Accepted (1)'])
    }
    
print(results)   

# 6. Print Summary of Results
print("\n--- DETAILED CLASSIFICATION REPORTS ---")
for name, res in results.items():
    print(f"\nModel: {name}")
    print(res['Classification Report'])

{'Logistic Regression': {'Accuracy': 0.8036175710594315, 'Classification Report': '              precision    recall  f1-score   support\n\nRejected (0)       0.58      0.26      0.36        82\nAccepted (1)       0.83      0.95      0.88       305\n\n    accuracy                           0.80       387\n   macro avg       0.70      0.60      0.62       387\nweighted avg       0.77      0.80      0.77       387\n'}, 'Decision Tree': {'Accuracy': 0.8036175710594315, 'Classification Report': '              precision    recall  f1-score   support\n\nRejected (0)       0.54      0.51      0.53        82\nAccepted (1)       0.87      0.88      0.88       305\n\n    accuracy                           0.80       387\n   macro avg       0.70      0.70      0.70       387\nweighted avg       0.80      0.80      0.80       387\n'}, 'Random Forest': {'Accuracy': 0.8268733850129198, 'Classification Report': '              precision    recall  f1-score   support\n\nRejected (0)       0.64      0.4

Parameters: { "use_label_encoder" } are not used.



In [None]:
# Define the file paths as provided in the prompt
file_accept_path = './5_Readability/issue_pr_readability_accepted.csv'
file_reject_path = './5_Readability/issue_pr_readability_rejected.csv'

# Read the dataframes
pr_read_accept_df = pd.read_csv(file_accept_path)
pr_read_reject_df = pd.read_csv(file_reject_path)

# Count the number of negative decimal numbers in the 'issue_readability' column
# A negative decimal number will be < 0
negative_accept_count = (pr_read_accept_df['issue_readability'] < 0).sum()
negative_reject_count = (pr_read_reject_df['issue_readability'] < 0).sum()

negative_accept_count_pr = (pr_read_accept_df['pr_readability'] < 0).sum()
negative_reject_count_pr = (pr_read_reject_df['pr_readability'] < 0).sum()

print(f"Number of negative score in issue body (Accepted PRs): {negative_accept_count}")
print(f"Number of negative score in issue body (Rejected PRs): {negative_reject_count}")

print(f"Number of negative score in PR body (Accepted PRs): {negative_accept_count_pr}")
print(f"Number of negative score in PR body (Rejected PRs): {negative_reject_count_pr}")



Number of negative score in issue body (Accepted PRs): 12
Number of negative score in issue body (Rejected PRs): 17
Number of negative score in PR body (Accepted PRs): 7
Number of negative score in PR body (Rejected PRs): 9
