<a href="https://colab.research.google.com/github/deyanb06/SIE-tp3/blob/main/stackoverflow_tag_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [73]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [74]:
data =  pd.read_csv("drive/MyDrive/ML/stackoverflow_project/train.csv")  # Update with the path to your dataset

stopwords = set(stopwords.words('english'))

# Regular expression to match a string of repeated characters
repeated_char_pattern = re.compile(r'(.)\1+')
replace_space = re.compile('[/(){}\[\]\|@,;]')
bad_symbols = re.compile('[^a-z #+_]')

# Function for basic text cleaning
def clean_text(text):
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = replace_space.sub(' ', text)  # Remove special characters and numbers
    text = bad_symbols.sub('', text) # delete symbols which are in bad_symbols from text
    # text = repeated_char_pattern.sub('', text)
    text = ' '.join(word for word in text.split() if word not in stopwords and not word.isnumeric()) # delete stopwords from text
    return text

data['cleaned'] = data['post'].apply(clean_text)
# print(data['cleaned'])

In [68]:
print(data['cleaned'])

0        causing behavior c# datetime type test public ...
1        dynamic html load iframe aspnet site users sav...
2        convert float value minsec trying convert seco...
3        net framework redistributable wondering get ne...
4        trying calculate print mean returning rather n...
                               ...                        
27995    sql join multiple tables pivot trying inner jo...
27996    simple sql query executing familiar mysql impr...
27997    different output end function rather main func...
27998    statego working # url appended end url login r...
27999    understanding mechanisms intentservice trying ...
Name: cleaned, Length: 28000, dtype: object


In [78]:

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2))
tfidf_matrix = tfidf.fit_transform(data['cleaned'])
count_array = tfidf_matrix.toarray()
df = pd.DataFrame(data=count_array,columns = tfidf.get_feature_names_out())
print(df)

# # Encode the tags
# label_encoder = LabelEncoder()
# data['encoded_tags'] = label_encoder.fit_transform(data['tags'])

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned'], data['tags'], test_size=0.3, random_state=42
)


       __init__  __init__ self  __name__  _blank  _blank gt  _elem  \
0           0.0            0.0  0.000000     0.0        0.0    0.0   
1           0.0            0.0  0.000000     0.0        0.0    0.0   
2           0.0            0.0  0.000000     0.0        0.0    0.0   
3           0.0            0.0  0.000000     0.0        0.0    0.0   
4           0.0            0.0  0.080407     0.0        0.0    0.0   
...         ...            ...       ...     ...        ...    ...   
27995       0.0            0.0  0.000000     0.0        0.0    0.0   
27996       0.0            0.0  0.000000     0.0        0.0    0.0   
27997       0.0            0.0  0.000000     0.0        0.0    0.0   
27998       0.0            0.0  0.000000     0.0        0.0    0.0   
27999       0.0            0.0  0.000000     0.0        0.0    0.0   

       _elem _traitsgt  _files  _files file  _get  ...  ymd  youtube  yyyy  \
0                  0.0     0.0          0.0   0.0  ...  0.0      0.0   0.0   
1  

In [71]:

# Train the Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)
print(lr_predictions)

# Evaluate the model
lr_accuracy = accuracy_score(y_test, lr_predictions)
lr_f1 = f1_score(y_test, lr_predictions, average='weighted')

print(f"Logistic Regression Model Accuracy: {lr_accuracy * 100:.2f}%")
print(f"Logistic Regression Model F1 Score: {lr_f1:.2f}")


[10  2 15 ...  6  4 10]
Logistic Regression Model Accuracy: 79.05%
Logistic Regression Model F1 Score: 0.79


In [79]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

naive_bayes = Pipeline([("tfidf", TfidfVectorizer()),
                       ("naivebayes", MultinomialNB())])
naive_bayes.fit(X_train, y_train)

# accuracy of naive bayes classifier
print(naive_bayes.score(X_train, y_train))
print(naive_bayes.score(X_test, y_test))

0.8837244897959183
0.7348809523809524


In [81]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer # for term-frequency-inverse document-frequency transformation

logit_regr = Pipeline([("vect", TfidfVectorizer()),
                       # works better with tfid without the need to increase max_iter
                       #("tfid", TfidfTransformer()),
                       ("logit", LogisticRegression(multi_class="multinomial", solver = "sag", max_iter=50))])
logit_regr.fit(X_train, y_train)

# accuracy of logistic regression
print('unprocessed:\n')
print(logit_regr.score(X_train, y_train))
print(logit_regr.score(X_test, y_test))

unprocessed:

0.9139795918367347
0.7935714285714286


In [108]:
test =  pd.read_csv("drive/MyDrive/ML/stackoverflow_project/test.csv")
pred = test.drop('post', axis=1)
pred.info()
# pred = pred.drop([['post']])
pred['tags'] = logit_regr.predict(test['post'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Id      12000 non-null  int64
dtypes: int64(1)
memory usage: 93.9 KB


In [109]:
csv_file_path = 'drive/MyDrive/ML/stackoverflow_project/output.csv'
pred.to_csv(csv_file_path, index=False)  # Set index=False to not write row indices
print(pred)

          Id       tags
0          2       html
1         12     iphone
2         13    asp.net
3         19     iphone
4         22  angularjs
...      ...        ...
11995  39972        php
11996  39980       html
11997  39984         c#
11998  39988     python
11999  40000    android

[12000 rows x 2 columns]
