In [1]:
import os

# A bare `CUDA_VISIBLE_DEVICES=0` statement only binds a Python variable; CUDA
# reads the *process environment*. Export the value there too, and do it before
# anything initialises CUDA (this cell runs ahead of the torch import).
CUDA_VISIBLE_DEVICES = 0
os.environ["CUDA_VISIBLE_DEVICES"] = str(CUDA_VISIBLE_DEVICES)

In [2]:
import torch
import torch.nn as nn
import os
import logging
import numpy as np
import random
from tqdm import tqdm
import time
import pandas as pd

from transformers import LongformerModel, AutoTokenizer, LongformerForSequenceClassification, LongformerForMultipleChoice
from transformers import AutoTokenizer, RobertaModel
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report

# The log file lives under ./logs; create the folder up front because
# logging.basicConfig raises FileNotFoundError when it is missing.
os.makedirs('./logs', exist_ok=True)
logging.basicConfig(filename=f'./logs/train_{time.asctime().replace(" ","_")}.log', filemode='w', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Root logger: everything at INFO and above is recorded
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Mirror log messages to the console through exactly one stream handler.
# Re-running this cell used to stack an extra handler each time, so every
# message was printed repeatedly. FileHandler subclasses StreamHandler, hence
# the exact type comparison rather than isinstance().
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
console_handler = next(
    (handler for handler in logger.handlers if type(handler) is logging.StreamHandler),
    None,
)
if console_handler is None:
    console_handler = logging.StreamHandler()
    logger.addHandler(console_handler)
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(formatter)

# Seed every random number generator in use so runs are repeatable
torch.manual_seed(40)
np.random.seed(40)
random.seed(40)
torch.cuda.manual_seed(40)
torch.backends.cudnn.deterministic = True
# cuDNN autotuning may pick different kernels between runs; switch it off so the
# deterministic flag above is not undermined.
torch.backends.cudnn.benchmark = False

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Label CSVs, one per split (train / validation / test)
train_csv_file = "/data1/debajyoti/colie/train.csv"
val_csv_file = "/data1/debajyoti/colie/valid.csv"
test_csv_file = "/data1/debajyoti/colie/test.csv"

# Load the three tables in one pass; the unpacking order follows the path order
train_labels, val_labels, test_labels = map(
    pd.read_csv, (train_csv_file, val_csv_file, test_csv_file)
)
test_labels

Unnamed: 0,BOOK_id
0,7616_1.txt
1,7616_2.txt
2,7616_3.txt
3,7616_4.txt
4,7616_5.txt
...,...
143025,5677_92.txt
143026,5677_93.txt
143027,5677_94.txt
143028,5677_95.txt


In [4]:
# Peek at the first BOOK_id entry to see what the file names look like
train_labels.BOOK_id[0]

'27993_1.txt'

In [5]:
# Folder holding the raw text files of the training split
train_folder = "/data1/debajyoti/colie/train/train/"
# Folder holding the raw text files of the validation split
val_folder = "/data1/debajyoti/colie/valid/valid/"
# Folder holding the raw text files of the (unlabelled) test split
test_folder = "/data1/debajyoti/colie/test/test/"



def create_df(folder, label, encoding='ISO-8859-1'):
    """Read every text file listed in ``label`` and pair it with its epoch label.

    Parameters
    ----------
    folder : str
        Directory that contains the text files.
    label : pandas.DataFrame
        Must provide a ``BOOK_id`` column (file names relative to ``folder``)
        and an ``Epoch`` column (string labels).
    encoding : str, optional
        Encoding used to decode each file. Defaults to ``'ISO-8859-1'``, the
        value that used to be hard-coded.

    Returns
    -------
    tuple[list[str], list[str]]
        File contents and whitespace-stripped labels, both in row order.

    Raises
    ------
    OSError
        If a listed file cannot be opened.
    """
    text_data = []
    labels = []
    # Walk both columns positionally. The former ``label[col][index]`` lookup
    # returns a Series (and breaks os.path.join) as soon as the index contains
    # duplicate values, and it repeats a label-based lookup for every row.
    for file_name, epoch in zip(label['BOOK_id'], label['Epoch']):
        file_path = os.path.join(folder, file_name)
        with open(file_path, 'r', encoding=encoding) as file:
            text_data.append(file.read())
        labels.append(epoch.strip())
    return text_data, labels

def create_df_test(folder, label, encoding='ISO-8859-1'):
    """Read every text file listed in ``label`` (no labels are expected).

    Parameters
    ----------
    folder : str
        Directory that contains the text files.
    label : pandas.DataFrame
        Must provide a ``BOOK_id`` column with file names relative to ``folder``.
    encoding : str, optional
        Encoding used to decode each file. Defaults to ``'ISO-8859-1'``, the
        value that used to be hard-coded.

    Returns
    -------
    list[str]
        File contents in row order.

    Raises
    ------
    OSError
        If a listed file cannot be opened.
    """
    text_data = []
    # Iterate over the column values directly. The former
    # ``label['BOOK_id'][index]`` lookup returns a Series (and breaks
    # os.path.join) when the index contains duplicate values.
    for file_name in label['BOOK_id']:
        file_path = os.path.join(folder, file_name)
        with open(file_path, 'r', encoding=encoding) as file:
            text_data.append(file.read())
    return text_data

# Read the raw documents (and labels where available) for each split
train_data, train_label = create_df(train_folder, train_labels)
val_data, val_label = create_df(val_folder, val_labels)
test_data = create_df_test(test_folder, test_labels)

# Create a dataframe from the lists
train = pd.DataFrame({'text': train_data, 'label': train_label})
val = pd.DataFrame({'text': val_data, 'label': val_label})
test = pd.DataFrame({'text': test_data})

# Separate the previews with a blank line: print() joins its arguments with a
# single space by default, which glued the last row of one frame to the header
# of the next.
print(train.head(), val.head(), test.head(), sep='\n\n')
print(train.shape, val.shape, test.shape)

                                                text      label
0  rifle; Ivan's was a double-barrelled shot-gun ...  Viktorian
1  upon the track of the bear. After following it...  Viktorian
2  to pull him out with their hands--even had the...  Viktorian
3  a slight sparkle of scientific conceit, "this ...  Viktorian
4  bears with a white ring round their necks? Yes...  Viktorian                                                 text      label
0  kind good morning, and returned her hearty emb...  Viktorian
1  sky, and of the moon, which clothed the old pi...  Viktorian
2  left Rome for Augsburg, my mind being much exc...  Viktorian
3  thoughts some of the old melodies he knew by h...  Viktorian
4  "But," said Henry, "is it not possible that th...  Viktorian                                                 text
0  "Alas, poor girl!" said I, "I fear that her ha...
1  to divide her attention between the said garco...
2  visitor's disposition to gallantry. However, s...
3  says Juvenal, "'M

In [6]:
# Epoch name -> integer class id
label_dic = {'Romanticism':0,
            'Viktorian':1,
            'Modernism':2,
            'PostModernism':3,
            'OurDays':4}

# Encode the labels of the labelled splits in place.
# The guard keeps the cell idempotent: once a column is integer-typed, pushing
# it through label_dic again would replace every value with NaN. Labels that
# are missing from label_dic now fail loudly instead of silently becoming NaN.
for _split_name, _split in (('train', train), ('val', val)):
    if pd.api.types.is_integer_dtype(_split['label']):
        continue
    _encoded = _split['label'].map(label_dic)
    if _encoded.isna().any():
        _unknown = sorted(set(_split.loc[_encoded.isna(), 'label'].astype(str)))
        raise ValueError(f"Labels without an entry in label_dic ({_split_name}): {_unknown}")
    _split['label'] = _encoded.astype('int64')
# The test frame has no label column, so there is nothing to encode for it.

In [8]:
# Length of text
def length (txt):
    """Return the number of whitespace-separated tokens in ``txt``."""
    tokens = txt.split()
    return len(tokens)

# Token count of every training document, listed from longest to shortest
txt_length = train['text'].apply(length)
longest_first = txt_length.sort_values(ascending=False)
print(longest_first)

483268    1128
483267    1068
521384    1065
483265    1034
81542     1020
          ... 
470405       1
130188       1
217335       1
351867       1
368135       1
Name: text, Length: 546210, dtype: int64


In [9]:
# Number of validation documents per label value, to gauge class balance
val['label'].value_counts()

label
1    16938
2    14848
3     1713
4     1600
0     1158
Name: count, dtype: int64

In [11]:
# Default pad/truncate length (in tokens) for the tokenizer calls below
max_length= 500
class CustomDataset(Dataset):
    """Torch dataset that tokenizes the ``text`` column of a dataframe on access.

    Parameters
    ----------
    tokenizer : callable
        Invoked as ``tokenizer(text, None, **options)``. It must return a
        mapping with ``input_ids`` and ``attention_mask`` entries whose first
        dimension is a batch dimension of size one.
    df : pandas.DataFrame
        Needs a ``text`` column. A ``label`` column is optional: when it is
        absent (e.g. an unlabelled split) the items carry no ``label`` key.
    max_len : int, optional
        Pad/truncate length. ``None`` (the default) falls back to the
        module-level ``max_length`` at item-access time, as before.
    """

    def __init__(self, tokenizer, df, max_len=None):
        self.tokenizer = tokenizer
        self.df = df
        self.max_len = max_len
        # Decide once whether labels exist instead of checking per item
        self.has_label = 'label' in df.columns

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the tokenized sample at position ``index`` as a dict of tensors."""
        limit = max_length if self.max_len is None else self.max_len

        # Column-first access: ``df.iloc[index][col]`` built a full row Series
        # for every field that was read.
        text = self.df['text'].iloc[index]

        # NOTE(review): add_special_tokens=False leaves out the tokenizer's
        # start/end markers - confirm the downstream model does not rely on them.
        inputs = self.tokenizer(
            text,
            None,
            add_special_tokens=False,
            max_length=limit,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' adds a leading batch dimension; [0] strips it
        item = {
            'input_ids': inputs['input_ids'][0],
            'attention_mask': inputs['attention_mask'][0],
        }
        if self.has_label:
            item['label'] = torch.tensor(self.df['label'].iloc[index], dtype=torch.int64)
        return item

# NOTE(review): `tokenizer` is not assigned in any cell visible in this
# notebook - confirm it is created before this cell runs.
# Wrap the two labelled splits
train_dataset, val_dataset = (CustomDataset(tokenizer, split) for split in (train, val))

# Loader settings shared by both splits; only the training loader shuffles.
# Each loader is wrapped in tqdm, so the names below are progress-bar objects.
batch_size = 256
_loader_options = {'batch_size': batch_size, 'num_workers': 64}
train_dataloader = tqdm(DataLoader(train_dataset, shuffle=True, **_loader_options))
val_dataloader = tqdm(DataLoader(val_dataset, shuffle=False, **_loader_options))

  0%|          | 0/2134 [00:00<?, ?it/s]

In [18]:
class CustomDataset_test(Dataset):
    """Label-free dataset: yields the tokenized ``text`` of each dataframe row."""

    def __init__(self, tokenizer, df):
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize the row at position ``index`` and return its tensors."""
        row = self.df.iloc[index]

        # Tokenizer settings: pad/truncate to the module-level max_length and
        # ask for PyTorch tensors plus an attention mask.
        tokenizer_options = dict(
            add_special_tokens=False,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(row['text'], None, **tokenizer_options)

        # [0] drops the leading batch dimension of each returned tensor
        return {field: encoded[field][0] for field in ('input_ids', 'attention_mask')}

# Unlabelled test split -> dataset -> progress-wrapped loader (row order kept)
test_dataset = CustomDataset_test(tokenizer=tokenizer, df=test)

test_dataloader = tqdm(
    DataLoader(dataset=test_dataset, shuffle=False, batch_size=batch_size)
)


[A

In [23]:
# Integer class id -> epoch name. This rebinds label_dic, which an earlier cell
# used for the opposite direction (name -> id).
label_dic = {0:'Romanticism',
            1:'Viktorian',
            2:'Modernism',
            3:'PostModernism',
            4:'OurDays'}
# The test label table only has an 'Epoch' column after predictions have been
# stored in it, so skip the decoding when this cell runs before that point (a
# fresh top-to-bottom run raised KeyError here). Values that are not keys of
# label_dic, such as names left by an earlier run, pass through unchanged
# instead of being turned into NaN.
if 'Epoch' in test_labels.columns:
    test_labels['Epoch'] = test_labels['Epoch'].map(lambda value: label_dic.get(value, value))
test_labels

Unnamed: 0,BOOK_id,Epoch
0,7616_1.txt,Viktorian
1,7616_2.txt,Viktorian
2,7616_3.txt,Viktorian
3,7616_4.txt,Viktorian
4,7616_5.txt,Viktorian
...,...,...
143025,5677_92.txt,Viktorian
143026,5677_93.txt,Modernism
143027,5677_94.txt,Modernism
143028,5677_95.txt,Modernism


In [7]:
# Import libraries

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_val_predict
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from scipy.sparse import hstack

In [None]:
# Create the feature extractor restricted to trigrams (ngram_range=(3, 3))
trigram_vectorizer = CountVectorizer(ngram_range=(3, 3))

# Learn the trigram vocabulary and transform the data into count vectors.
# NOTE(review): `X` is not assigned in any cell visible in this notebook -
# confirm where it is supposed to come from.
X_trigrams = trigram_vectorizer.fit_transform(X)

In [None]:
# Create the feature extractor using bag of words (CountVectorizer defaults)
bow_vectorizer = CountVectorizer()

# Learn the vocabulary and transform the data into bag of words count vectors.
# NOTE(review): `X` is not assigned in any cell visible in this notebook -
# confirm where it is supposed to come from.
X_bow = bow_vectorizer.fit_transform(X)

In [None]:
# Concatenate the trigram features and bag of words features column-wise, so
# each row keeps both feature sets side by side
X_features = hstack([X_trigrams, X_bow])

In [None]:
# Perform feature selection using the chi2 score #2KFH
k = 400 # Select the top k features
feature_selector = SelectKBest(chi2, k=k)
# NOTE(review): `y` is not assigned in any cell visible in this notebook -
# confirm where the target vector is supposed to come from.
X_selected = feature_selector.fit_transform(X_features, y)

In [None]:
# Create the multinomial Naive Bayes classifier with its default settings
nb_classifier = MultinomialNB()

In [None]:
# 10-fold cross-validation: per-fold accuracy, plus a classification report
# built from the out-of-fold predictions. The two calls each run their own
# cross-validation pass. Nothing is printed here; the results are only stored.
# NOTE(review): `y` is not assigned in any cell visible in this notebook.
cv_results = cross_val_score(nb_classifier, X_selected, y, cv=10, scoring='accuracy')
classification_report_cv = classification_report(y, cross_val_predict(nb_classifier, X_selected, y, cv=10))
mean_accuracy = cv_results.mean()

In [24]:
# Import libraries

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_val_predict
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from scipy.sparse import hstack


In [8]:
# Stack the train and validation rows into one frame with a fresh 0..n-1 index
train_df = pd.concat((train, val), axis=0, ignore_index=True)
train

Unnamed: 0,text,label
0,rifle; Ivan's was a double-barrelled shot-gun ...,1
1,upon the track of the bear. After following it...,1
2,to pull him out with their hands--even had the...,1
3,"a slight sparkle of scientific conceit, ""this ...",1
4,bears with a white ring round their necks? Yes...,1
...,...,...
546205,"the manner described in the text, might lay cl...",2
546206,"surnamed, answered, Ã¢ÂÂNa, na, there are na...",2
546207,that of Themis. My informant was Alexander Kei...,2
546208,"a heavy blow. cloyed a dud, stolen a rag. coll...",2


In [9]:
# Inputs and targets for the classical model, taken from the combined
# train + validation frame. Note: this rebinds train_data, train_labels and
# test_data, which earlier cells used for the raw text lists and the label CSV.
train_data, train_labels = train_df['text'], train_df['label']

# The test split only provides text
test_data = test['text']

In [10]:
# Bag-of-words counts: the vocabulary is learned from train_data only and then
# reused unchanged to encode the test text.
vectorizer = CountVectorizer()
train_vectors = vectorizer.fit_transform(train_data)
# val_vectors = vectorizer.transform(val_data)
test_vectors = vectorizer.transform(test_data)

In [11]:
# # Create the feature extractor using trigrams
# trigram_vectorizer = CountVectorizer(ngram_range=(3, 3))

# # Transform the data into trigram feature vectors
# train_vectors_trigrams = trigram_vectorizer.fit_transform(train_data)

In [12]:
# # Concatenate the trigram features and bag of words features
# X_features = hstack([train_vectors, train_vectors_trigrams])

In [13]:
# # Perform feature selection using chi2 #2KFH
# k = 2400 # Select the top k features 
# feature_selector = SelectKBest(chi2, k=k)
# X_selected = feature_selector.fit_transform(X_features, train_labels)

In [14]:
# Fit a support-vector classifier with scikit-learn's default settings on the
# bag-of-words counts.
# NOTE(review): the scikit-learn docs warn that SVC fit time grows at least
# quadratically with the number of samples and point to LinearSVC or
# SGDClassifier for large datasets - worth considering for a corpus this size.
classifier = SVC()
classifier.fit(train_vectors, train_labels)

In [None]:
# Predict a class id for every test document.
# NOTE(review): the same statement appears again in a later cell; one of the
# two looks redundant.
test_predictions = classifier.predict(test_vectors)

In [43]:
# 10-fold cross-validation of `classifier`: per-fold accuracy, plus a
# classification report built from the out-of-fold predictions. The two calls
# each run their own cross-validation pass.
# NOTE(review): `X_selected` is only assigned in a cell that depends on the
# undefined `X`/`y` and in a cell that is commented out, so this cell fails on
# a fresh kernel - confirm which feature matrix is intended here.
cv_results = cross_val_score(classifier, X_selected, train_labels, cv=10, scoring='accuracy')
classification_report_cv = classification_report(train_labels, cross_val_predict(classifier, X_selected, train_labels, cv=10))
mean_accuracy = cv_results.mean()
mean_accuracy

0.5512879305459965

In [44]:
# Show the per-class precision / recall / F1 table computed from the
# cross-validated predictions
print(classification_report_cv)

              precision    recall  f1-score   support

           0       0.17      0.67      0.27     21319
           1       0.66      0.53      0.59    257303
           2       0.70      0.61      0.65    252936
           3       0.21      0.41      0.27     23852
           4       0.23      0.17      0.20     27057

    accuracy                           0.55    582467
   macro avg       0.39      0.48      0.40    582467
weighted avg       0.62      0.55      0.57    582467



In [14]:
# Predict a class id for every test document with the fitted classifier
test_predictions = classifier.predict(test_vectors)

In [15]:
# Inspect the predicted class ids (still integers at this point)
test_predictions

array([1, 1, 0, ..., 2, 2, 2])

In [None]:
# Store the predicted class ids in the 'Epoch' column of the test label table;
# this modifies test_labels in place.
test_labels["Epoch"] = test_predictions

In [None]:
# Integer class id -> epoch name. This rebinds label_dic, which an earlier cell
# used for the opposite direction (name -> id).
label_dic = {0:'Romanticism',
            1:'Viktorian',
            2:'Modernism',
            3:'PostModernism',
            4:'OurDays'}
# Decode the predicted ids into epoch names. Values that are not keys of
# label_dic, such as names left by an earlier run of this cell, pass through
# unchanged; a plain .map(label_dic) would overwrite them with NaN on re-run.
test_labels['Epoch'] = test_labels['Epoch'].map(lambda value: label_dic.get(value, value))
test_labels

Unnamed: 0,BOOK_id,Epoch
0,7616_1.txt,Viktorian
1,7616_2.txt,Viktorian
2,7616_3.txt,Romanticism
3,7616_4.txt,Romanticism
4,7616_5.txt,Romanticism
...,...,...
143025,5677_92.txt,Modernism
143026,5677_93.txt,Modernism
143027,5677_94.txt,Modernism
143028,5677_95.txt,Modernism


In [None]:
# Write the submission file. Create the target folder first: to_csv fails when
# the directory does not exist.
# NOTE(review): test_labels still carries the 'Unnamed: 0' column read from the
# test CSV, so it is written out too - confirm the submission format expects it.
submission_path = '/data1/debajyoti/colie/submission/submission_6.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
test_labels.to_csv(submission_path, index=False)