### https://github.com/HarmanDotpy/Named-Entity-Recognition-in-Pytorch/blob/main/scripts/train_bilstm_char_random_glove.py

In [43]:
import torch
torch.manual_seed(10)
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from torch.utils.data import Dataset, DataLoader, TensorDataset
import io
import sklearn
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import pickle as pickle
import warnings
from sklearn.exceptions import UndefinedMetricWarning
import seqeval
from seqeval.metrics import accuracy_score as seq_accuracy_score
from seqeval.metrics import classification_report as seq_classification_report
from seqeval.metrics import f1_score as seq_f1_score
import pandas as pd

In [44]:
## BILSTM model

class BiLSTM(nn.Module):
    def __init__(self, embedding_size, hidden_size, total_words, num_class, pretrained = False, pretrained_embed = None):
        super(BiLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.wordembed = nn.Embedding.from_pretrained(pretrained_embed, freeze = False)
        self.dropout = nn.Dropout(p = 0.5)
        self.bilstm = nn.LSTM(embedding_size,hidden_size, bidirectional = True, batch_first = True)
        self.linear = nn.Linear(2*hidden_size, num_class) # 2 because forward and backward concatenate

    def forward(self, x, xlengths): 
        x = pack_padded_sequence(x, xlengths.cpu(), batch_first=True, enforce_sorted=False)
        x, _ = pad_packed_sequence(x, batch_first=True)
        word_embedding = self.wordembed(x) # x is of size(batchsize, seq_len), out is of size (batchsize, seq_len, embedding_size = 100)
        # word_embedding = self.fcembed(word_embedding)
        word_embedding = self.dropout(word_embedding) # dropout

        out, (h,c) = self.bilstm(word_embedding) #'out' has dimension(batchsize, seq_len, 2*hidden_size)
        out = self.linear(out) # now 'out' has dimension(batchsize, seq_len, num_class)
        out = out.view(-1, out.shape[2]) # shape (128*seqlen, 18)
        out = F.log_softmax(out, dim=1) # take the softmax across the dimension num_class, 'out' has dimension(batchsize, seq_len, num_class)
        return out

#### HELPER FUNCTIONS

In [81]:
# reading text file in python and making list of sentences (list of lists) and list of tags(list of lists)
def load_data(datapath, buildvocab_tags= True, vocab = None, nertags = None):
    if(buildvocab_tags == True):
        all_words = []
        all_tags = []
        with open(datapath) as f:
            lines = f.readlines()
            sent_num = 0
            for line in lines[2:]: #1: so that the first blank line isn't taken into account
                if(line == "\n"):
                    sent_num+=1
                else:
                    line_sep = line.split(sep = " ")
                    all_words.append(line_sep[0])
                    all_tags.append(line_sep[3][:-1])
                    
        words = list(set(all_words))
        tags = list(set(all_tags))

        vocab = {}
        vocab['<pad>'] = 0 # for padding input sequences
        vocab['<oov>'] = 1
        for i, word in enumerate(words):
            vocab[word] = i+2
            
        nertags = {}
        nertags['padtag'] = 0
        for i,nertag in enumerate(tags):
            nertags[nertag] = i+1

    train_sent = []
    train_tags = []
    with open(datapath) as f:
        lines = f.readlines()
        sent_num = 0
        sentence = []
        tag = []
        for line in lines[2:]: #1: so that the first blank line isn't taken into account
            if(line == "\n"):
                sent_num+=1
                train_sent.append(sentence)
                train_tags.append(tag)
                sentence = []
                tag = []
            else:
                line_sep = line.split(sep = " ")
                if(line_sep[0] in vocab.keys()):
                    sentence.append(vocab[line_sep[0]])
                else:
                    sentence.append(vocab['<oov>'])
                    
                tag.append(nertags[line_sep[3][:-1]])

    # padding the sentences at the end
    seq_maxlen = max(len(x) for x in train_sent)
    x_lengths = [len(x) for x in train_sent]
    Xtrain = []
    Ytrain = []
    for sent, tags in zip(train_sent, train_tags):
        length_toappend = seq_maxlen - len(sent)
        Xtrain.append(sent+[0]*length_toappend)
        Ytrain.append(tags+[0]*length_toappend)


    Xtrain = torch.Tensor(Xtrain)
    Ytrain = torch.Tensor(Ytrain)
    x_lengths = torch.Tensor(x_lengths)
    print(Xtrain.shape, Ytrain.shape, x_lengths.shape)
    
    return Xtrain, Ytrain, x_lengths, vocab, nertags


#### Loading data

In [46]:
traindatapath = 'data/train.txt'
devdatapath = 'data/dev.txt'
testdatapath = 'data/test.txt'


Xtrain, Ytrain, x_trainlengths, vocab, nertags = load_data(traindatapath, buildvocab_tags=True)
Xdev, Ydev, x_devlengths, _, _ = load_data(devdatapath, buildvocab_tags=False, vocab = vocab, nertags = nertags)

torch.Size([14986, 113]) torch.Size([14986, 113]) torch.Size([14986])
torch.Size([3465, 109]) torch.Size([3465, 109]) torch.Size([3465])


In [47]:
traindataset = TensorDataset(Xtrain, Ytrain, x_trainlengths)
Trainloader = DataLoader(traindataset, batch_size= 128, shuffle=True)

devdataset = TensorDataset(Xdev, Ydev, x_devlengths)
Devloader = DataLoader(devdataset, batch_size= 128, shuffle=True)

In [48]:
# DEFINE MY MODEL!!! 

pre_embeddings = 'glove'
Expname = 'BILSTM_glove'
rootpath = 'out/'
glove_embeddings_file = 'data/glove.6B.100d.txt'

In [49]:
if torch.cuda.is_available():  
    device = "cuda:0" 
else:  
    device = "cpu"  

In [50]:
# LOAD EMBEDDINGS
embedding_size = 100
if(pre_embeddings == "glove"):
    gloveembeddings_index = {}
    with io.open(glove_embeddings_file, encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:],dtype='float32')
            gloveembeddings_index[word] = coefs

    #using vocab and Xtrain, Xvalid, get pretrained glove word embeddings
    glove_embeds = np.zeros((len(vocab), embedding_size))
    for word in vocab.keys():
        if(word in gloveembeddings_index.keys()):
            # for the pad word let theembedding be all zeros
            glove_embeds[vocab[word]] = gloveembeddings_index[word]
        else:
            glove_embeds[vocab[word]] = np.random.randn(embedding_size)
    word_embeds = torch.Tensor(glove_embeds)
    # print(glove_embeds.shape) # shape (vocablength , embedding dim)


In [51]:
# classes to be looked at for performance metrics
imp_classes = [nertags[tag] for tag in nertags.keys()]
imp_classes.remove(nertags['padtag'])
imp_classes.remove(nertags['O'])

In [52]:
model = BiLSTM(embedding_size = 100, hidden_size = 100, total_words = len(vocab), num_class = len(nertags), pretrained = True, pretrained_embed = word_embeds).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)
lossfunction = nn.CrossEntropyLoss()

In [53]:
def performance(y, ypred, nertags):
    y = y.numpy()
    ypred = ypred.numpy()
    mask = (y != nertags['padtag']) * (y != nertags['O'])
    y = y*mask
    ypred = ypred*mask
    acc = ((y==ypred)*mask).sum()/mask.sum()
    microf1 = f1_score(y, ypred, labels = imp_classes, average='micro')
    macrof1 = f1_score(y, ypred, labels = imp_classes, average='macro')

    return acc, microf1, macrof1

In [54]:
def validate(model, loader):
        with torch.no_grad():
            validloss = 0
            acc = 0
            microf1 = 0
            macrof1 = 0
            i = 0
            for step, (X, Y, xlen) in enumerate(loader):
                Y = pack_padded_sequence(Y, xlen, batch_first=True, enforce_sorted=False)
                Y, _ = pad_packed_sequence(Y, batch_first=True)
                ypred = model(X.long().to(device), xlen.to(device))#.permute(0, 2, 1)
                vloss = lossfunction(ypred.to('cpu'), Y.view(-1).type(torch.LongTensor))
                validloss+=vloss.item()
                acc_, microf1_, macrof1_ = performance(Y.view(-1), torch.argmax(ypred.to('cpu'), dim = 1), nertags)
                acc+=acc_
                microf1 += microf1_
                macrof1 += macrof1_
                i+=1

        return validloss/i, acc/i, microf1/i, macrof1/i

In [55]:
trainlosslist = []
trainacclist = []
trainmicrof1list = []
trainmacrof1list = []

validlosslist = []
valacclist = []
valmicrof1list = []
valmacrof1list = []

In [73]:
# Model is ready now we have to train using cross entropy loss
num_epochs = 10

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# validloss = []
model.train()
for epoch in range(num_epochs):
    #if(epoch == 8):
     #   optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
        
    totalloss, acc, microf1, macrof1 = 0, 0, 0, 0
    for step, (Xbatch ,Ybatch, xbatch_len) in enumerate(Trainloader):
        #make gradients 0
        optimizer.zero_grad()

        Ybatch = pack_padded_sequence(Ybatch, xbatch_len, batch_first=True, enforce_sorted=False)
        Ybatch, y_lengths = pad_packed_sequence(Ybatch, batch_first=True)

        #get output from model and claculate loss
        ypred = model(Xbatch.long().to(device), xbatch_len.to(device))#.permute(0, 2, 1)
        
        acc_, microf1_, macrof1_ = performance(Ybatch.view(-1), torch.argmax(ypred.to('cpu'), dim = 1), nertags)
        acc+= acc_
        microf1+=microf1_
        macrof1+=macrof1_
        if(step%20 == 0 and step !=0):
            print("training accuracy = {}, microF1 = {}, macroF1 = {}".format(acc/(step+1), microf1/(step+1), macrof1/(step+1)))
            
        loss = lossfunction(ypred.to('cpu'), Ybatch.view(-1).type(torch.LongTensor)) #Ybatch has dimension (batchsize, seqlen), ypred has dimension(batchsize, num_classes, seqlen)
        totalloss += loss.item()

        #backward and step
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 5) # clip gradient to 5
        optimizer.step()
        
    trainlosslist.append(totalloss/(step+1))
    trainacclist.append(acc/(step+1))
    trainmicrof1list.append(microf1/(step+1))
    trainmacrof1list.append(macrof1/(step+1))

    # model validation loss and scheduler step for learning rate change if required
    val_loss, val_acc, val_microf1, val_macrof1  = validate(model, Devloader)
    validlosslist.append(val_loss)
    valacclist.append(val_acc)
    valmicrof1list.append(val_microf1)
    valmacrof1list.append(val_macrof1)
        
    # scheduler.step(val_loss)
    print('\nepoch = {}, training_loss = {}, validation_loss ={}, training_acc = {}, validation_acc ={}'.format(epoch, trainlosslist[-1], validlosslist[-1], trainacclist[-1], valacclist[-1]))        
    

training accuracy = 0.9705395665173524, microF1 = 0.9759409986889355, macroF1 = 0.9668993079017264
training accuracy = 0.9713021428052855, microF1 = 0.9764773525670124, macroF1 = 0.9686703757666425
training accuracy = 0.969046301756781, microF1 = 0.974298244926513, macroF1 = 0.9668866153570084
training accuracy = 0.9682213645570705, microF1 = 0.9735829926050069, macroF1 = 0.9659688234845565
training accuracy = 0.9688201387671389, microF1 = 0.9742000792062591, macroF1 = 0.9669517318400818

epoch = 0, training_loss = 0.0059711401936819114, validation_loss =0.05561547114380768, training_acc = 0.9682280837383773, validation_acc =0.811799442128865
training accuracy = 0.9689954375995278, microF1 = 0.9741614909234774, macroF1 = 0.9633264679763927
training accuracy = 0.9703555731364956, microF1 = 0.9754709777575805, macroF1 = 0.9677807895376281
training accuracy = 0.9707369968098122, microF1 = 0.9757602196455846, macroF1 = 0.9685754896913688
training accuracy = 0.9707096718794264, microF1 = 0.

In [74]:
id2word = {id: word for word, id in vocab.items()} 
id2tag = {}
for tag in nertags.keys():
    if(tag == 'padtag'):
         id2tag[nertags[tag]] = 'O' 
    else:
        id2tag[nertags[tag]] = tag

In [75]:
model.eval()

BiLSTM(
  (wordembed): Embedding(23626, 100)
  (dropout): Dropout(p=0.5, inplace=False)
  (bilstm): LSTM(100, 100, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=200, out_features=10, bias=True)
)

In [76]:
import os
if not os.path.exists(rootpath):
       os.mkdir(rootpath)

if not os.path.exists(rootpath+Expname):
    os.mkdir(rootpath+Expname)

In [82]:
def final_metrics(model,loader):
    y_predicted = []
    y_true = []
    with torch.no_grad():
        for step, (X, Y, xlen) in enumerate(loader):
            Y = pack_padded_sequence(Y, xlen, batch_first=True, enforce_sorted=False)
            Y, _ = pad_packed_sequence(Y, batch_first=True)
            ypred = model(X.long().to(device), xlen.to(device))#.permute(0, 2, 1)
            ypred = torch.argmax(ypred.to('cpu'), dim = 1)
            ypred = ypred.view(Y.shape[0], -1)
            y_predicted.append(ypred)
            y_true.append(Y)

    y_predicted_list = []
    y_true_list = []
    for i in range(len(y_predicted)):
        for j in range(y_predicted[i].shape[0]):
            sent_pred = []
            sent_true = []
            for x in range(y_predicted[i].shape[1]):
                sent_pred.append(id2tag[int(y_predicted[i][j, x])])
                sent_true.append(id2tag[int(y_true[i][j, x])])
            y_predicted_list.append(sent_pred)
            y_true_list.append(sent_true)
    
    return seq_f1_score(y_true_list, y_predicted_list), seq_accuracy_score(y_true_list, y_predicted_list), seq_classification_report(y_true_list, y_predicted_list, digits = 3)

In [83]:
#Test DATASET
Xtest, Ytest, x_testlengths, _, _ = load_data(testdatapath, buildvocab_tags=False, vocab = vocab, nertags = nertags)

testdataset = TensorDataset(Xtest, Ytest, x_testlengths)
loader_test = DataLoader(testdataset, batch_size= 1, shuffle=False)
test_f1_conll, test_acc_conll, test_classif_report = final_metrics(model, loader_test)

print("PERFORMANCE ON Test DATA")
print('MicroF1 = {}'.format(test_f1_conll))
print('Accuracy = {}'.format(test_acc_conll))
print('------------Classification Report-------------')
print(test_classif_report)

torch.Size([3683, 124]) torch.Size([3683, 124]) torch.Size([3683])
PERFORMANCE ON Test DATA
MicroF1 = 0.7276116091239903
Accuracy = 0.9392264009428908
------------Classification Report-------------
              precision    recall  f1-score   support

         LOC      0.880     0.775     0.824      1668
        MISC      0.721     0.681     0.700       702
         ORG      0.804     0.609     0.693      1661
         PER      0.591     0.814     0.685      1617

   micro avg      0.729     0.726     0.728      5648
   macro avg      0.749     0.720     0.726      5648
weighted avg      0.755     0.726     0.730      5648



In [84]:
def out_predictions(model, loader, output_file):
    with open(output_file, 'w') as f:
        with torch.no_grad():
            for step, (X, Y, xlen) in enumerate(loader):
                Y = pack_padded_sequence(Y, xlen, batch_first=True, enforce_sorted=False)
                Y, _ = pad_packed_sequence(Y, batch_first=True)
                ypred = model(X.long().to(device), xlen.to(device))
                ypred = torch.argmax(ypred.to('cpu'), dim=1)
                ypred = ypred.view(Y.shape[0], -1)
                for i in range(len(ypred)):
                    for j in range(len(ypred[i])):
                        word = id2word[int(X[i, j])]
                        tag = id2tag[int(ypred[i, j])]
                        f.write(f"{word}\t{tag}\n")
                    f.write('\n')

In [85]:
out_predictions(model, loader_test, 'predictions/gold_bilstm_word3.txt')

In [86]:
# SPAN-F1 SCORE

def readBIO(path):
    ents = []
    curEnts = []
    for line in open(path):
        line = line.strip()
        if line == '':
            ents.append(curEnts)
            curEnts = []
        elif line[0] == '#' and len(line.split('\t')) == 1:
            continue
        else:
            curEnts.append(line.split('\t')[1])
    return ents

def toSpans(tags):
    spans = set()
    for beg in range(len(tags)):
        if tags[beg][0] == 'B':
            end = beg
            for end in range(beg+1, len(tags)):
                if tags[end][0] != 'I':
                    break
            spans.add(str(beg) + '-' + str(end) + ':' + tags[beg][2:])
            #print(end-beg)
    return spans

def getInstanceScores(predPath, goldPath):
    goldEnts = readBIO(goldPath)
    predEnts = readBIO(predPath)
    entScores = []
    tp = 0
    fp = 0
    fn = 0
    for goldEnt, predEnt in zip(goldEnts, predEnts):
        goldSpans = toSpans(goldEnt)
        predSpans = toSpans(predEnt)
        overlap = len(goldSpans.intersection(predSpans))
        tp += overlap
        fp += len(predSpans) - overlap
        fn += len(goldSpans) - overlap
        
    prec = 0.0 if tp+fp == 0 else tp/(tp+fp)
    rec = 0.0 if tp+fn == 0 else tp/(tp+fn)
    f1 = 0.0 if prec+rec == 0.0 else 2 * (prec * rec) / (prec + rec)
    return f1

In [87]:
pred = 'predictions/gold_bilstm_word3.txt'
gold = 'data/gold.txt'

In [88]:
score = getInstanceScores(pred,gold)
print('Span-F1 score - word-biLSTM ', score)

Span-F1 score - word-biLSTM  0.7551676619200735


#### NOISE

In [89]:
def load_test_data(datapath):
    sentences = []
    tags = []
    with open(datapath) as f:
        lines = f.readlines()
        sentence = []
        tag = []
        for line in lines:
            line = line.strip()  # Remove leading/trailing whitespace
            if line:  # If the line is not empty
                word, tag_label = line.split('\t')
                if vocab is not None:
                    if word in vocab.keys():
                        sentence.append(vocab[word])
                    else:
                        sentence.append(vocab['<oov>'])
                if nertags is not None:
                    tag.append(nertags[tag_label])
            else:  # If the line is empty, indicating end of a sentence
                if sentence:
                    sentences.append(sentence)
                    tags.append(tag)
                    sentence = []
                    tag = []

    # Padding the sentences at the end
    max_length = max(len(x) for x in sentences)
    x_lengths = [len(x) for x in sentences]
    X_test = []
    Y_test = []
    for sent, tag in zip(sentences, tags):
        length_to_append = max_length - len(sent)
        X_test.append(sent + [0] * length_to_append)  # Padding with zeros
        Y_test.append(tag + [0] * length_to_append)  # Padding with zeros

    X_test = torch.Tensor(X_test)
    Y_test = torch.Tensor(Y_test)
    x_lengths = torch.Tensor(x_lengths)

    return X_test, Y_test, x_lengths


In [90]:
noise_rates = {
    'capitalization_swap': [0.1, 0.15, 0.2, 0.25, 0.3], 
    'character_swap': [0.1, 0.15, 0.2, 0.25, 0.3],
    'character_removal': [0.1, 0.15, 0.2, 0.25, 0.3],
    'character_replacement': [0.1, 0.15, 0.2, 0.25, 0.3]
}

In [91]:
def loader_test(testdatapath):
    Xtest, Ytest, x_testlengths = load_test_data(testdatapath)

    testdataset = TensorDataset(Xtest, Ytest, x_testlengths)
    loader_test = DataLoader(testdataset, batch_size=1, shuffle=False)
    return loader_test

In [92]:
noise_types = []
noise_rate = []
f1_scores = []

for noise_type in noise_rates.keys():
    for rate in noise_rates[noise_type]:
        my_loader = loader_test(f'data/altered/{noise_type}_rate_{rate}.txt')
        out_predictions(model, my_loader, f'predictions/word3/{noise_type}_rate_{rate}.txt')
        score = getInstanceScores(f'predictions/word3/{noise_type}_rate_{rate}.txt', 'data/gold.txt')
        
        # Append data to lists
        noise_types.append(noise_type)
        noise_rate.append(rate)
        f1_scores.append(score)

data = {'Noise Type': noise_types, 'Noise Rate': noise_rate, 'F1 Score': f1_scores}
df = pd.DataFrame(data)
print(df)

               Noise Type  Noise Rate  F1 Score
0     capitalization_swap        0.10  0.690082
1     capitalization_swap        0.15  0.652507
2     capitalization_swap        0.20  0.628815
3     capitalization_swap        0.25  0.598346
4     capitalization_swap        0.30  0.569720
5          character_swap        0.10  0.715556
6          character_swap        0.15  0.549860
7          character_swap        0.20  0.381871
8          character_swap        0.25  0.318514
9          character_swap        0.30  0.304601
10      character_removal        0.10  0.710736
11      character_removal        0.15  0.520597
12      character_removal        0.20  0.317655
13      character_removal        0.25  0.259942
14      character_removal        0.30  0.259981
15  character_replacement        0.10  0.714078
16  character_replacement        0.15  0.520380
17  character_replacement        0.20  0.317993
18  character_replacement        0.25  0.251981
19  character_replacement        0.30  0

In [93]:
df.to_csv('out/df_noise_word3.csv', index=False)

In [72]:
# Save the trained model's state dictionary
torch.save(model.state_dict(), 'models/bilstm_word/model_bilst_word.pth')

# Save the dictionaries
with open('models/bilstm_word/id2word.pkl', 'wb') as f:
    pickle.dump(id2word, f)
with open('models/bilstm_word/id2tag.pkl', 'wb') as f:
    pickle.dump(id2tag, f)
with open('models/bilstm_word/vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)
with open('models/bilstm_word/nertags.pkl', 'wb') as f:
    pickle.dump(nertags, f)

#### Altered_2

In [96]:
noise_rates = {
    'capitalization_swap': [0, 0.1, 0.2, 0.3, 0.4],  # Adjust the rates as needed
    'character_swap': [0, 0.05, 0.075, 0.1, 0.125, 0.15, 0.17, 0.2],
    'character_removal': [0, 0.05, 0.075, 0.1, 0.125, 0.15, 0.17, 0.2],
    'character_replacement': [0, 0.05, 0.075, 0.1, 0.125, 0.15, 0.17, 0.2]
}

In [97]:
noise_types = []
noise_rate = []
f1_scores = []

for noise_type in noise_rates.keys():
    for rate in noise_rates[noise_type]:
        my_loader = loader_test(f'data/altered_2/{noise_type}_rate_{rate}.txt')
        out_predictions(model, my_loader, f'predictions/altered_2/word/{noise_type}_rate_{rate}.txt')
        score = getInstanceScores(f'predictions/altered_2/word/{noise_type}_rate_{rate}.txt', 'data/gold.txt')
        
        # Append data to lists
        noise_types.append(noise_type)
        noise_rate.append(rate)
        f1_scores.append(score)

data = {'Noise Type': noise_types, 'Noise Rate': noise_rate, 'F1 Score': f1_scores}
df = pd.DataFrame(data)
print(df)

               Noise Type  Noise Rate  F1 Score
0     capitalization_swap       0.000  0.755168
1     capitalization_swap       0.100  0.691408
2     capitalization_swap       0.200  0.620485
3     capitalization_swap       0.300  0.571056
4     capitalization_swap       0.400  0.509348
5          character_swap       0.000  0.755168
6          character_swap       0.050  0.755168
7          character_swap       0.075  0.754571
8          character_swap       0.100  0.715233
9          character_swap       0.125  0.636414
10         character_swap       0.150  0.552861
11         character_swap       0.170  0.467537
12         character_swap       0.200  0.383893
13      character_removal       0.000  0.755168
14      character_removal       0.050  0.755168
15      character_removal       0.075  0.754641
16      character_removal       0.100  0.711462
17      character_removal       0.125  0.616622
18      character_removal       0.150  0.521275
19      character_removal       0.170  0

In [98]:
df.to_csv('out/altered_2_word.csv', index=False)