# In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd

# Train a TF-IDF + random-forest severity classifier on the labelled bug
# reports, report macro precision on a 20% hold-out split, then write
# predictions for the unlabelled test file to predictions.csv.

# One seed for both the split and the forest, so reruns reproduce the same
# hold-out score and the same predictions.
RANDOM_STATE = 42

df = pd.read_csv("../input/termproject-v5/bugs-train.csv")

# TfidfVectorizer rejects NaN documents; treat a missing summary as empty text.
summaries = df["summary"].fillna("")

# Splitting the data into train and hold-out sets
X_train, X_test, y_train, y_test = train_test_split(
    summaries, df["severity"], test_size=0.2, random_state=RANDOM_STATE
)

# Defining a TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Derive the classes and their balanced weights from the training labels
# only: using the full data set would leak the hold-out label distribution
# into the model and could list a class the classifier never sees in fit().
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)

# Convert class_weights to dictionary for RandomForestClassifier
class_weights_dict = dict(zip(classes, class_weights))

# Defining a Random Forest Classifier with class weights
rf_classifier = RandomForestClassifier(
    class_weight=class_weights_dict,
    random_state=RANDOM_STATE,
)

# Creating a pipeline: raw summary text -> TF-IDF features -> classifier
pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('clf', rf_classifier),
])

# Training the model
pipeline.fit(X_train, y_train)

# Predictions on the hold-out set
predictions = pipeline.predict(X_test)

# Calculating macro precision (unweighted mean of per-class precision)
macro_precision = precision_score(y_test, predictions, average='macro')

print("Macro Precision:", macro_precision)

# Load the unlabelled test data
test_data = pd.read_csv("../input/termproject-v5/bugs-test.csv")

# Get predictions for test data, with the same missing-summary handling as
# the training data
test_predictions = pipeline.predict(test_data["summary"].fillna(""))

# Add predictions to the test data and keep only the submission columns.
# NOTE(review): the id column is read as "bug id" (with a space) but written
# as "bug_id" -- confirm this matches the header of bugs-test.csv.
test_data["severity"] = test_predictions
test_data = pd.DataFrame({"bug_id":test_data["bug id"], "severity":test_data["severity"]})

# Save the predictions to predictions.csv in the current working directory
test_data.to_csv('predictions.csv', index=False)
