In [None]:
import pandas as pd

train_data_df = pd.read_csv('/kaggle/input/siemens/train.csv')
test_data_df = pd.read_csv('/kaggle/input/siemens/test.csv')

In [None]:
import re
from bs4 import BeautifulSoup

def preprocess(text):
    text = BeautifulSoup(str(text))
    text = text.get_text()
    text = text.encode("ascii", "ignore")
    text = text.decode()
    text = text.replace("b'", "")
    text = re.sub('[^A-Za-z0-9 ]+', '', text)
    return text

In [None]:
train_data_df['New_Sentence'] = train_data_df['New_Sentence'].apply(preprocess)
test_data_df['New_Sentence'] = test_data_df['New_Sentence'].apply(preprocess)

In [None]:
label_dict = dict()
i = 0
for l in train_data_df['Type'].unique():
    label_dict[l] = i
    i += 1
labels = [label_dict[i] for i in train_data_df['Type']]
train_data_df['labels'] = labels
label_dict

In [None]:
train_data = train_data_df['New_Sentence'].to_list()
test_data = test_data_df['New_Sentence'].to_list()

In [None]:
train_data_df['labels'].plot(kind='hist', edgecolor='black')

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train_data, labels, test_size=0.2, random_state=1, stratify=labels)

### Linear Model

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

In [None]:
svm = SGDClassifier(loss = 'hinge', max_iter=3000, n_jobs=-1)
svm.fit(X_train_vec, y_train)
predictions = svm.predict(X_val_vec)
accuracy = accuracy_score(y_val, predictions)
print("Accuracy: {:.2f}%".format(accuracy * 100))

In [None]:
vectorizer = TfidfVectorizer()
train_vec = vectorizer.fit_transform(train_data)
test_vec = vectorizer.transform(test_data)

In [None]:
svm = SGDClassifier(loss = 'hinge', max_iter=3000, n_jobs=-1)
svm.fit(train_vec, labels)
predictions = svm.predict(test_vec)

In [None]:
res = []
for i in predictions:
    res.append(list(label_dict.keys())[list(label_dict.values()).index(i)])

In [None]:
result = test_data_df[['Sentence_id']]
result['Type'] = res
result.to_csv('/kaggle/working/svm_result.csv', index = False)

### XGBoost

In [None]:
import xgboost as xgb

In [None]:
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

In [None]:
model = xgb.XGBClassifier(n_estimators = 1500, max_depth = 3, use_label_encoder=False)
model.fit(X_train_vec, y_train)
predictions = model.predict(X_val_vec)
accuracy = accuracy_score(y_val, predictions)
print("Accuracy: {:.2f}%".format(accuracy * 100))

In [None]:
vectorizer = TfidfVectorizer()
train_vec = vectorizer.fit_transform(train_data)
test_vec = vectorizer.transform(test_data)

In [None]:
model = xgb.XGBClassifier(n_estimators = 1500, max_depth = 3, use_label_encoder=False)
model.fit(train_vec, labels)
predictions = model.predict(test_vec)

In [None]:
res = []
for i in predictions:
    res.append(list(label_dict.keys())[list(label_dict.values()).index(i)])

In [None]:
result = test_data_df[['Sentence_id']]
result['Type'] = res
result.to_csv('/kaggle/working/XGBoost_result.csv', index = False)

### BERT

In [None]:
import torch
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
DEVICE

device(type='cuda')

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(X_train, padding=True, truncation=True, max_length=256, return_tensors="pt")
val_encodings = tokenizer(X_val, padding=True, truncation=True, max_length=256, return_tensors="pt")

In [None]:
from torch.utils.data import Dataset
class SentenceDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_encode = SentenceDataset(train_encodings, y_train)
val_encode = SentenceDataset(val_encodings, y_val)

In [None]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_encode, batch_size=32, shuffle=True)
val_loader = DataLoader(val_encode, batch_size=32, shuffle=False)

In [None]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)
model.to(DEVICE)

optim = torch.optim.Adam(model.parameters(), lr=5e-5)
model.train()

In [None]:
def compute_accuracy(model, data_loader, device):
    with torch.no_grad():
        correct_pred, num_examples = 0, 0

        for batch_idx, batch in enumerate(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs['logits']
            predicted_labels = torch.argmax(logits, 1)
            num_examples += labels.size(0)
            correct_pred += (predicted_labels == labels).sum()

        return correct_pred.float()/num_examples * 100

In [17]:
NUM_EPOCHS = 1
for epoch in range(NUM_EPOCHS):
    model.train()
    for batch_idx, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs['loss'], outputs['logits']
        optim.zero_grad()
        loss.backward()
        optim.step()
        if not batch_idx % 250:
            print (f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d} | '
                   f'Batch {batch_idx:04d}/{len(train_loader):04d} | '
                   f'Loss: {loss:.4f}')
    model.eval()

    with torch.set_grad_enabled(False):
        print(f'Training accuracy: '
              f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
              f'\nValid accuracy: '
              f'{compute_accuracy(model, val_loader, DEVICE):.2f}%')

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch: 0001/0001 | Batch 0000/1503 | Loss: 1.7815
Epoch: 0001/0001 | Batch 0250/1503 | Loss: 0.7808
Epoch: 0001/0001 | Batch 0500/1503 | Loss: 0.4839
Epoch: 0001/0001 | Batch 0750/1503 | Loss: 0.5863
Epoch: 0001/0001 | Batch 1000/1503 | Loss: 0.7698
Epoch: 0001/0001 | Batch 1250/1503 | Loss: 0.4696
Epoch: 0001/0001 | Batch 1500/1503 | Loss: 0.9134
Training accuracy: 81.98%
Valid accuracy: 78.18%


In [22]:
torch.save(model, '/kaggle/working/BERT.pt')

In [None]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_data, padding=True, truncation=True, max_length=256, return_tensors="pt")
train_encode = SentenceDataset(train_encodings, train_data_df['labels'].to_list())
train_loader = DataLoader(train_encode, batch_size=16, shuffle=False)

In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)
model.to(DEVICE)
optim = torch.optim.Adam(model.parameters(), lr=5e-5)

In [None]:
NUM_EPOCHS = 5
for epoch in range(NUM_EPOCHS):
    for batch_idx, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs['loss'], outputs['logits']
        optim.zero_grad()
        loss.backward()
        optim.step()
        if not batch_idx % 250:
            print (f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d} | '
                   f'Batch {batch_idx:04d}/{len(train_loader):04d} | '
                   f'Loss: {loss:.4f}')

In [None]:
torch.save(model, '/kaggle/working/Full_BERT.pt')

In [None]:
model = torch.load('/kaggle/working/Full_BERT.pt')
model.to(DEVICE)
model.eval()
predictions = []

for i in range(0, len(test_encodings['input_ids']), 32):
    input_ids = test_encodings['input_ids'][i:i+32].to(DEVICE)
    attention_masks = test_encodings['attention_mask'][i:i+32].to(DEVICE)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_masks)
        outputs = torch.argmax(outputs['logits'], dim=1).tolist()
    predictions.extend(outputs)

In [None]:
res = []
for i in predictions:
    res.append(list(label_dict.keys())[list(label_dict.values()).index(i)])

In [None]:
result = test_data_df[['Sentence_id']]
result['Type'] = res
result.to_csv('/kaggle/working/BERT_result.csv', index = False)