In [1]:
import numpy as np
from collections import Counter
import nltk
from functools import reduce

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.16.0-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.16.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import matplotlib.pyplot as plt

#"/content/drive/MyDrive/CS505_final_drive/data/train.csv"

trainingSet = pd.read_csv("/content/drive/MyDrive/CS505_final_drive/data/train.csv")

print("train.csv shape is ", trainingSet.shape)


train.csv shape is  (139753, 9)


# First Look at the dataset

In [4]:
trainingSet.head()

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Time,Summary,Text,Score
0,195370,1890228583,A3VLX5Z090RQ0V,1,2,1030838400,An Unexplained Anime Review,I was very anxious to see the Uncut version of...,2.0
1,1632470,B00BEIYSL4,AUDXDMFM49NGY,0,1,1405036800,not great.,Movie was okay...not great.,3.0
2,9771,0767809335,A3LFIA97BUU5IE,3,36,983750400,Technical problem with this DVD,"Like the Dinosaur Collector's Edition DVD, thi...",1.0
3,218855,6300215792,A1QZM75342ZQVQ,1,1,1394841600,Heeeeyyyyy LAAAAADEEE!!!!,"Come on, now..... this has to be, by far, the...",5.0
4,936225,B000B5XOZW,ANM2SCEUL3WL1,1,1,1163721600,Herzog the Great Traveler of both natural and ...,I've always been a great admirer of Herzog's o...,4.0


### Checking if there are null values in the relevant columns. For this task, the relevant columns are "Summary", "Text", and "Score".

In [8]:
df=trainingSet[["Id", "Summary", "Text", "Score"]]

In [7]:
df.isna().any()

Id         False
Summary     True
Text        True
Score       True
dtype: bool

In [8]:
df["Summary"].isna().value_counts()

False    139752
True          1
Name: Summary, dtype: int64

In [9]:
df["Text"].isna().value_counts()

False    139752
True          1
Name: Text, dtype: int64

In [10]:
df["Score"].isna().value_counts()

False    122283
True      17470
Name: Score, dtype: int64

In [11]:
df[df["Summary"].isna()==True]

Unnamed: 0,Id,Summary,Text,Score
105624,1009122,,"I saw the DVD in the store, so I picked it up....",1.0


In [12]:
df[df["Text"].isna()==True]

Unnamed: 0,Id,Summary,Text,Score
65597,251331,Bumbling cops + super-lucky criminals = funny ...,,4.0


### Since there are only 1 null values in "Summary" and "Text" columns, it wouldn't be so necessary to work with imputation techniques. We will just drop them.

### And for the null values in the "Score" column, we thought we could just drop them as well. This is because we have still 122283 rows with non-null score values which are still plentiful to work with.

In [9]:
df= df.dropna()

In [8]:
df.isna().any()

Id         False
Summary    False
Text       False
Score      False
dtype: bool

## Exploring the dataset

### We first observed if the dataset is balanced or not:

In [7]:
df["Score"].value_counts()

5.0    65313
4.0    27817
3.0    14482
1.0     7360
2.0     7309
Name: Score, dtype: int64

### Dataset is highly imbalanced, escpecially for scores 5 and 4 .

WE HAVE THREE OPTIONS: OVERSAMPLE, UNDERSAMPLE, OR LEAVE AT IT IS.

OVERSAMPLE: We can use language models to generate texts for undersampled classes. (Synthetic data generation). Language models can be BERT and GPT-2. But since the gap between the number of undersampled class and oversampled class is huge, we can create synthetic data for some extend.

UNDERSAMPLE: We can just drop the rows of oversampled class. And we would have relatively balanced dataset.

But regardless of doing either of them, we need to process the text first!!!

## Text Processing

Instead of dealing with summary and the review text seperately we can merge these two columns. This is because summary texts don't include to much information by itself. To show this:

In [6]:
summary_text=df["Summary"].to_list()
summary_text_lens=list(map(lambda x: len(x.split(" ")), summary_text))

print(f"The average word length of summary text is {sum(summary_text_lens)/len(summary_text)}")

The average word length of summary text is 4.849232505458739


In [10]:
df["Merged_Text"]= df["Summary"]+" "+df["Text"]

In [6]:
df["Merged_Text"]

0         An Unexplained Anime Review I was very anxious...
1                    not great. Movie was okay...not great.
2         Technical problem with this DVD Like the Dinos...
3         Heeeeyyyyy LAAAAADEEE!!!! Come on, now.....  t...
4         Herzog the Great Traveler of both natural and ...
                                ...                        
139748    Toby Stephens IS Mr. Rochester. I kept remindi...
139749    The Cold Reality The portrayal of high school ...
139750    Great deal Everything as promised.  Fun for al...
139751    Great movie marred by the quality of this VHS ...
139752    Hilarious look at the view of Catholic religio...
Name: Merged_Text, Length: 139753, dtype: object

In [11]:
texts=df["Merged_Text"].to_list()

text=' '.join(texts)
text[:4000]

"An Unexplained Anime Review I was very anxious to see the Uncut version of Kite, or kee-tay as I called it, and I finally manage to see that Anime, but when I see the entire movie...man, it was very hard to explain.  First off, the plot is very questionable, this movie has got to be the first I've ever seen that deals with a lot of...'DEPTH'. There are still many things in this movie that I still don't understand:  What's the real reason that Akai wants Sawa to assasinate certain innocent people, why can Sawa just have the heart to decide not to kill those people and find a solution to that, and mainly, why Akai killed Sawa's parents?  There are many more questions unanswered, but I don't want to spoil it to the people, all I can say is that none of this makes sense, except for the fact that Akai is a perverted, child-raping nutcase! The animation and characters in this anime is good, but not at a same level as 'Ghost In The Shell', and everything else..just forget it!Overall...this i

First, we can lower case the text:

In [29]:
text=text.lower()

We can get rid of punctuation marks. Actually the use of punctuation marks may contain relevant information about the nature of the text. But in this project our main aim is to probe the performance of models in the task of text classification.

In [7]:
import re


In [30]:

text=re.sub('[!"#$%&\(\)*+.,;<=>?@^_\[\]{|}~]+', '',text)
text=re.sub('[/:]', ' ',text)
text=re.sub(' - ', ' ',text)
text=re.sub('- ', ' ',text)
text=re.sub(' -', ' ',text)
text=re.sub('--', ' ',text)
text=re.sub('\s+', ' ', text)
text=re.sub('(\d+\S*)', '', text)
text=re.sub('\S+\d+', '', text)
text=re.sub(r'(.)\1{2,}', r'\1', text)

text=re.sub(r"('\S+')", r'\1', text)

In [31]:
text[:1000]

"an unexplained anime review i was very anxious to see the uncut version of kite or kee-tay as i called it and i finally manage to see that anime but when i see the entire movieman it was very hard to explain first off the plot is very questionable this movie has got to be the first i've ever seen that deals with a lot of'depth' there are still many things in this movie that i still don't understand what's the real reason that akai wants sawa to assasinate certain innocent people why can sawa just have the heart to decide not to kill those people and find a solution to that and mainly why akai killed sawa's parents there are many more questions unanswered but i don't want to spoil it to the people all i can say is that none of this makes sense except for the fact that akai is a perverted child-raping nutcase the animation and characters in this anime is good but not at a same level as 'ghost in the shell' and everything elsejust forget itoverallthis is very hard to give the final resul

### Lemmatization:

In [32]:
text=re.sub("'ve"," have",text)
text=re.sub("'t"," not",text)
text=re.sub("'ll"," will",text)
text=re.sub("'d"," would",text)
text=re.sub("'re"," are",text)
text=re.sub("'m"," am",text)
text=re.sub("'s","",text)

In [33]:
text[:1000]

"an unexplained anime review i was very anxious to see the uncut version of kite or kee-tay as i called it and i finally manage to see that anime but when i see the entire movieman it was very hard to explain first off the plot is very questionable this movie has got to be the first i have ever seen that deals with a lot of wouldepth' there are still many things in this movie that i still don not understand what the real reason that akai wants sawa to assasinate certain innocent people why can sawa just have the heart to decide not to kill those people and find a solution to that and mainly why akai killed sawa parents there are many more questions unanswered but i don not want to spoil it to the people all i can say is that none of this makes sense except for the fact that akai is a perverted child-raping nutcase the animation and characters in this anime is good but not at a same level as 'ghost in the shell' and everything elsejust forget itoverallthis is very hard to give the final

In [34]:
text=re.sub(" am ", " be ",text)
text=re.sub(" is ", " be ",text)
text=re.sub(" are ", " be ",text)
text=re.sub(" was ", " be ",text)
text=re.sub(" were ", " be ",text)
text=re.sub(" being ", " be ",text)
text=re.sub(" been ", " be ",text)

In [35]:
text=re.sub("'", "",text)

In [36]:
text[:1000]

'an unexplained anime review i be very anxious to see the uncut version of kite or kee-tay as i called it and i finally manage to see that anime but when i see the entire movieman it be very hard to explain first off the plot be very questionable this movie has got to be the first i have ever seen that deals with a lot of wouldepth there be still many things in this movie that i still don not understand what the real reason that akai wants sawa to assasinate certain innocent people why can sawa just have the heart to decide not to kill those people and find a solution to that and mainly why akai killed sawa parents there be many more questions unanswered but i don not want to spoil it to the people all i can say be that none of this makes sense except for the fact that akai be a perverted child-raping nutcase the animation and characters in this anime be good but not at a same level as ghost in the shell and everything elsejust forget itoverallthis be very hard to give the final result

In [37]:
from nltk.corpus import stopwords

nltk.download('stopwords')
stp_wrds=stopwords.words('english')

for word in stp_wrds:
    text=re.sub(f" {word} ", " ",text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [38]:
text[:1000]

'an unexplained anime review anxious see uncut version kite kee-tay called finally manage see anime see entire movieman hard explain first plot questionable movie got first ever seen deals lot wouldepth still many things movie still understand real reason akai wants sawa assasinate certain innocent people sawa heart decide kill people find solution mainly akai killed sawa parents many questions unanswered want spoil people say none makes sense except fact akai perverted child-raping nutcase animation characters anime good level ghost shell everything elsejust forget itoverallthis hard give final result kite uncensored watched many times whole excessive-meaningless gory violence whole child pornography anime still help understand give  stars becausewell know movie much great movie okaynot great technical problem dvd like dinosaur collector edition dvd one play pioneer elite dv- player runs  minutes stops hey lade come far best jerry lewis movie made full  comedy gags ridiculous situatio

Now, we will create a function that does the text processing in above.

In [8]:
def text_processing(text):


    text=text.lower()

    text=re.sub('[!"#$%&\(\)*+.,;<=>?@^_\[\]{|}~]+', '',text)
    text=re.sub('[/:]', ' ',text)
    text=re.sub(' - ', ' ',text)
    text=re.sub('- ', ' ',text)
    text=re.sub(' -', ' ',text)
    text=re.sub('--', ' ',text)
    text=re.sub('\s+', ' ', text)
    text=re.sub('(\d+\S*)', '', text)
    text=re.sub('\S+\d+', '', text)
    text=re.sub(r'(.)\1{2,}', r'\1', text)

    text=re.sub(r"('\S+')", r'\1', text)


    text=re.sub("'ve"," have",text)
    text=re.sub("'t"," not",text)
    text=re.sub("'ll"," will",text)
    text=re.sub("'d"," would",text)
    text=re.sub("'re"," are",text)
    text=re.sub("'m"," am",text)
    text=re.sub("'s","",text)

    text=re.sub(" am ", " be ",text)
    text=re.sub(" is ", " be ",text)
    text=re.sub(" are ", " be ",text)
    text=re.sub(" was ", " be ",text)
    text=re.sub(" were ", " be ",text)
    text=re.sub(" being ", " be ",text)
    text=re.sub(" been ", " be ",text)

    text=re.sub("'", "",text)

    return text


Since large language models that we are going to use like BERT and GPT-2 don't require all the text processing above, we wrote a special text processing function that makes only necessary text processing. By saying "necessary" cleaning, we meant getting rid of the akward texts like "&#34;type-casting&#34;...... heh)"

In [9]:
def text_processing_lm(text):

    text=re.sub('["#$%&\(\)*+<=>@^_\[\]{|}~]+', '',text)
    text=re.sub('[/:]', ' ',text)
    text=re.sub(' - ', ' ',text)
    text=re.sub('- ', ' ',text)
    text=re.sub(' -', ' ',text)
    text=re.sub('--', ' ',text)
    text=re.sub('\s+', ' ', text)
    text=re.sub('(\d+\S*)', '', text)
    text=re.sub('\S+\d+', '', text)
    text=re.sub(r'(.)\1{2,}', r'\1', text)

    text=re.sub(r"('\S+')", r'\1', text)

    return text


Since we are going to oversample the minority classes which are 1.0 and 2.0, we don't need to perform text processing for all rows.

In [11]:
df_textprocessed_1=df[df["Score"]==1.0]

In [12]:
class1_texts=df_textprocessed_1["Merged_Text"].tolist()

In [13]:
class1_texts_processed=list(map(text_processing_lm, class1_texts))

In [14]:
df_class1=pd.DataFrame()

df_class1["Id"]=df_textprocessed_1["Id"]
df_class1["Text"]=class1_texts_processed

In [14]:
df_class1.head()

Unnamed: 0,Id,Text
2,9771,Technical problem with this DVD Like the Dinos...
22,1205441,move on I fast forward through chunks of this ...
23,922828,"If you read the Koontz book, you will absolute..."
48,878469,"WTF WTF, Amazon, this is NOT the full movie, i..."
59,910248,"I really tried, but this is just awful. Colin ..."


In [15]:
df_textprocessed_2=df[df["Score"]==2.0]

class2_texts=df_textprocessed_2["Merged_Text"].tolist()

class2_texts_processed=list(map(text_processing_lm, class2_texts))



In [16]:
df_class2=pd.DataFrame()

df_class2["Id"]=df_textprocessed_2["Id"]
df_class2["Text"]=class2_texts_processed

In [19]:
df_class2.head()

Unnamed: 0,Id,Text
0,195370,An Unexplained Anime Review I was very anxious...
6,1213850,Disappointing Night Saw this film recently on ...
12,280462,It was just okay. The end of the film has mode...
29,1002289,Not my pick on a good pilates workout I would ...
34,1393851,"Has it's share of laughs, but goes overboard w..."


In [None]:
df_class1.to_csv("/Users/canerozer/Desktop/BU/FALL2023/CS505/Final_Project/data/df_class1.csv", index=False)
df_class2.to_csv("/Users/canerozer/Desktop/BU/FALL2023/CS505/Final_Project/data/df_class2.csv", index=False)




## Oversampling

### Fine-tuned model's perplexity:



In [22]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config

In [22]:
config=GPT2Config.from_pretrained('/content/drive/MyDrive/CS505_final_drive/final_project_gpt2_textgen_class1_epoch2')

gpt2_class1_model_epoch2 = GPT2LMHeadModel.from_pretrained('/content/drive/MyDrive/CS505_final_drive/final_project_gpt2_textgen_class1_epoch2', config=config)



In [23]:
config=GPT2Config.from_pretrained('/content/drive/MyDrive/CS505_final_drive/final_project_gpt2_textgen_class1_epoch4')

gpt2_class1_model_epoch4 = GPT2LMHeadModel.from_pretrained('/content/drive/MyDrive/CS505_final_drive/final_project_gpt2_textgen_class1_epoch4', config=config)



In [24]:
tokenizer1= GPT2Tokenizer.from_pretrained('/content/drive/MyDrive/CS505_final_drive/final_project_gpt2_tokenizer_class1_epoch2')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [25]:
tokenizer2= GPT2Tokenizer.from_pretrained('/content/drive/MyDrive/CS505_final_drive/final_project_gpt2_tokenizer_class1_epoch4')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [26]:
from datasets import load_dataset
import torch
from tqdm import tqdm


encodings1 = tokenizer1("\n\n".join(class1_texts_processed), return_tensors="pt")
#encodings2 = tokenizer2("\n\n".join(class2_texts_processed), return_tensors="pt")
max_length = gpt2_class1_model_epoch4.config.n_positions
stride = 512


def ppl(model, input_ids_all, stride):
    nlls = []
    for i in tqdm(range(0, input_ids_all.size(1), stride)):
        begin_loc = max(i + stride - max_length, 0)
        end_loc = min(i + stride, input_ids_all.size(1))
        trg_len = end_loc - i  # may be different from stride on last loop
        input_ids = input_ids_all[:, begin_loc:end_loc].to("cuda:0")
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            neg_log_likelihood = outputs[0] * trg_len

        nlls.append(neg_log_likelihood)

    ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
    return ppl



Token indices sequence length is longer than the specified maximum sequence length for this model (1470937 > 1024). Running this sequence through the model will result in indexing errors


In [27]:
gpt2 = GPT2LMHeadModel.from_pretrained('gpt2').cuda()

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

#### For class 1.0:

with epoch=4:

In [35]:
print(ppl(gpt2, encodings1.input_ids, stride))

100%|██████████| 2873/2873 [05:12<00:00,  9.21it/s]


tensor(46.4783, device='cuda:0')


In [37]:
print(ppl(gpt2_class1_model_epoch4.cuda(), encodings1.input_ids, stride))

100%|██████████| 2873/2873 [05:23<00:00,  8.89it/s]

tensor(22.9563, device='cuda:0')





with epoch=2:

In [28]:
print(ppl(gpt2_class1_model_epoch2.cuda(), encodings1.input_ids, stride))

100%|██████████| 2873/2873 [05:09<00:00,  9.28it/s]


tensor(45.5674, device='cuda:0')


The perplexity of pre-trained (but not fine-tuned) gpt2 model's perplexity with class 1.0 texts is 46.4783.

The perplexity of the model that I fine-tuned gpt2 with class 1.0 texts with 4 epochs is 22.9563.

The perplexity of the model that I fine-tuned gpt2 with class 1.0 texts with 2 epochs is 45.5674.

Therefore, the model that I fine-tuned with 2 epochs creates more or less the same text with pre-trained gpt2 model. But the model that I fine-tuned gpt2 with 4 epochs has a better performance than other models.

#### According to perplexity metric, the gpt2 model that I fine-tuned with 4 epochs creates more realistic texts. Thus, I will use models that I fine-tuned with 4 epochs to create synthetic data (oversampling) from now on.

## Generating Texts:

### I will oversample the minority classes such as 1.0 and 2.0. I will create new samples such that the minority classes would have 10.000 samples at the end.

In [19]:
import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [20]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


### For class 2.0:

In [21]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config

In [18]:
#'/content/drive/MyDrive/CS505_final_drive/final_project_gpt2_textgen_class2_epoch4'

config=GPT2Config.from_pretrained('/content/drive/MyDrive/CS505_final_drive/final_project_gpt2_textgen_class2_epoch4')

gpt2_class2_model = GPT2LMHeadModel.from_pretrained('/content/drive/MyDrive/CS505_final_drive/final_project_gpt2_textgen_class2_epoch4', config=config)

#gpt2_class2_model.cuda()



In [19]:
#'/content/drive/MyDrive/CS505_final_drive/final_project_gpt2_textgen_class2_epoch4'

gpt2_class2_tokenizer = GPT2Tokenizer.from_pretrained('/content/drive/MyDrive/CS505_final_drive/final_project_gpt2_tokenizer_class2_epoch4', config=config)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [22]:
import torch

When I tried to generate texts with max_length=500, it takes too much time. Because the average text length for class 2.0 is 168????, I can set max_length to 200.

In [23]:
gpt2_class2_model.eval()


prompt = "<|startoftext|>"

generated = torch.tensor(gpt2_class2_tokenizer.encode(prompt)).unsqueeze(0)
#generated = generated.to(device)


sample_outputs = gpt2_class2_model.generate(
                                generated,
                                #bos_token_id=random.randint(1,30000),
                                do_sample=True,
                                top_k=50,
                                max_length = 200,
                                top_p=0.95,
                                num_return_sequences= 1016  #2691-1675
                                )

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [24]:
class2_generated_df2= pd.DataFrame()

In [25]:
class2_generated_texts=[gpt2_class2_tokenizer.decode(sample_output, skip_special_tokens=True) for sample_output in sample_outputs]

In [26]:
class2_generated_df2["Generated_Texts"]=class2_generated_texts

In [37]:
#for i, sample_output in enumerate(sample_outputs):
#  print("{}: {}\n\n".format(i, gpt2_class2_tokenizer.decode(sample_output, skip_special_tokens=True)))

0: The only good thing about this movie is that the whole movie is actually a bunch of puppies stretched out into the  I thought it was supposed to be longer than it was, but there's really nothing that is in the movie for me to take it seriously, and that is how I would rate this movie in my book. If they really want you to believe that there is a difference between a person who sleeps through a hundred puppies and people who wake up sleeping in front of a computer with all that puppies in it then give it a pass.The actual puppies themselves are ok, and even so, you will find out that they sleep through pretty much everyone in the movie.The biggest problem is you just cannot take it seriously. The actual characters' motives and dialogue are extremely lame, and there's very little in the film that makes it stand out. I gave it a D and the F was the least interesting part because the lead male character did much of the writing. His love for the


1: Not One of My Favorites At the time o

In [27]:
class2_generated_df2.to_csv('/content/drive/MyDrive/CS505_final_drive/data/class2_generated_df2.csv', index=False)


### For class 1.0

In [23]:
#'/content/drive/MyDrive/CS505_final_drive/final_project_gpt2_textgen_class2_epoch4'

config=GPT2Config.from_pretrained('/content/drive/MyDrive/CS505_final_drive/final_project_gpt2_textgen_class1_epoch4')

gpt2_class1_model = GPT2LMHeadModel.from_pretrained('/content/drive/MyDrive/CS505_final_drive/final_project_gpt2_textgen_class1_epoch4', config=config)



In [24]:
#'/content/drive/MyDrive/CS505_final_drive/final_project_gpt2_textgen_class2_epoch4'

gpt2_class1_tokenizer = GPT2Tokenizer.from_pretrained('/content/drive/MyDrive/CS505_final_drive/final_project_gpt2_tokenizer_class1_epoch4', config=config)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [25]:
gpt2_class1_model.eval()


prompt = "<|startoftext|>"

generated = torch.tensor(gpt2_class1_tokenizer.encode(prompt)).unsqueeze(0)
#generated = generated.to(device)


sample_outputs = gpt2_class1_model.generate(
                                generated,
                                #bos_token_id=random.randint(1,30000),
                                do_sample=True,
                                top_k=50,
                                max_length = 200,
                                top_p=0.95,
                                num_return_sequences=1391 #2691-1300
                                )

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [26]:
class1_generated_df1= pd.DataFrame()

In [27]:
class1_generated_texts=[gpt2_class1_tokenizer.decode(sample_output, skip_special_tokens=True) for sample_output in sample_outputs]

In [28]:
class1_generated_df1["Generated_Texts"]=class1_generated_texts

In [29]:
class1_generated_df1.to_csv('/content/drive/MyDrive/CS505_final_drive/data/class1_generated_df1.csv', index=False)


## Loading the generated texts

With the computers we have, it was not possible to generate all the texts at once. This is because we have limited RAM in our computers. So instead generating all the texts at one time, we generated the some portion of it and then store it into our local computer to merge them later on.

In [10]:
class1_generated_df = pd.read_csv("/content/drive/MyDrive/CS505_final_drive/data/class1_generated_df.csv")


In [11]:
class1_generated_df1 = pd.read_csv("/content/drive/MyDrive/CS505_final_drive/data/class1_generated_df1.csv")


In [12]:
class1_generated_all_df=pd.concat([class1_generated_df, class1_generated_df1], axis=0,  ignore_index=True)


In [13]:
len0, _ = class1_generated_all_df.shape

In [14]:
class1_generated_all_df["Score"]=[1.0]*len0

In [15]:
class1_generated_all_df.head()

Unnamed: 0,Generated_Texts,Score
0,Very Disappointed I really like and admire the...,1.0
1,I have never been more excited about a good re...,1.0
2,"Horrible Movie I really hate to say it, but th...",1.0
3,I Love Anime..but I Don't Know How This Film W...,1.0
4,Horrible movie! Horrible movie! I rented this ...,1.0


In [16]:
class1_generated_all_df= class1_generated_all_df.rename(columns={"Generated_Texts": "Merged_Text"})


In [17]:
class1_generated_all_df.columns

Index(['Merged_Text', 'Score'], dtype='object')

In [18]:
class2_generated_df = pd.read_csv("/content/drive/MyDrive/CS505_final_drive/data/class2_generated_df.csv")

In [19]:
class2_generated_df1 = pd.read_csv("/content/drive/MyDrive/CS505_final_drive/data/class2_generated_df1.csv")

In [20]:
class2_generated_df2 = pd.read_csv("/content/drive/MyDrive/CS505_final_drive/data/class2_generated_df2.csv")

In [21]:
class2_generated_all_df=pd.concat([class2_generated_df, class2_generated_df1, class2_generated_df2], axis=0,  ignore_index=True)


In [22]:
len0, _= class2_generated_all_df.shape
len0

2691

In [23]:
class2_generated_all_df["Score"]=[2.0]*len0

In [88]:
class2_generated_all_df.head()

Unnamed: 0,Generated_Texts,Score
0,A film of potential greatness One wonders at o...,2.0
1,"Not funny When I first watched this movie, I t...",2.0
2,The movie is weak I'm rating this movie based ...,2.0
3,Don't bother. It isn't scary at all. I enjoyed...,2.0
4,What happened? What happened? The story involv...,2.0


In [24]:
class2_generated_all_df= class2_generated_all_df.rename(columns={"Generated_Texts": "Merged_Text"})


In [27]:
class2_generated_all_df.columns

Index(['Merged_Text', 'Score'], dtype='object')

### Preparing the final dataset

In [25]:
df_final_class1=df[df['Score']==1.0]
df_final_class2=df[df['Score']==2.0]
df_final_class3=df[df['Score']==3.0]
df_final_class4=df[df['Score']==4.0]
df_final_class5=df[df['Score']==5.0]


Recalling number of samples in each class:

In [23]:
df["Score"].value_counts()


5.0    65313
4.0    27817
3.0    14482
1.0     7360
2.0     7309
Name: Score, dtype: int64

We trimmed the majority classes to sample size of 10k:

In [26]:
df_final_class3=df_final_class3.sample(n=10000, random_state=42)
df_final_class4=df_final_class4.sample(n=10000, random_state=42)
df_final_class5=df_final_class5.sample(n=10000, random_state=42)

In [28]:
df_final_oversample_class1=pd.concat([class1_generated_all_df,df_final_class1], axis=0, ignore_index=True)

df_final_oversample_class2=pd.concat([class2_generated_all_df, df_final_class2], axis=0, ignore_index=True)



In [94]:
df_final_oversample_class1.head()

Unnamed: 0,Merged_Text,Score,Id,Summary,Text
0,Very Disappointed I really like and admire the...,1.0,,,
1,I have never been more excited about a good re...,1.0,,,
2,"Horrible Movie I really hate to say it, but th...",1.0,,,
3,I Love Anime..but I Don't Know How This Film W...,1.0,,,
4,Horrible movie! Horrible movie! I rented this ...,1.0,,,


In [110]:
df_final_oversample_class1.shape

(10051, 5)

In [111]:
df_final_oversample_class2.shape

(10000, 5)

In [29]:
df_final_oversample_class1= df_final_oversample_class1.sample(n=10000, random_state=42)

In [30]:
df_final_oversample=pd.concat([df_final_oversample_class1, df_final_oversample_class2, df_final_class3, df_final_class4, df_final_class5], axis=0, ignore_index=True)


In [116]:
df_final_oversample.shape

(50000, 5)

In [31]:
df_final_oversample= df_final_oversample.sample(frac=1.0, random_state=42)

In [32]:
df_final_oversample=df_final_oversample[["Merged_Text", "Score"]]

In [33]:
df_final_oversample.head()

Unnamed: 0,Merged_Text,Score
33553,"Not as good as it could've been, but still ver...",4.0
9427,Not my kind of show My roommate has been forci...,1.0
199,Terrible Movie I cannot believe people think t...,1.0
12447,"Good movie, bad movie Why do we need a movie l...",2.0
39489,A classic only because it was a first Tomatoes...,4.0


In [122]:
df_final_oversample.to_csv('/content/drive/MyDrive/CS505_final_drive/data/df_final_oversample.csv', index=False)


In [34]:
df_final_oversample["Merged_Text"]=df_final_oversample["Merged_Text"].apply(text_processing)


In [44]:
df_final_oversample["Score"]=df_final_oversample["Score"].apply(int)-1

In [45]:
df_final_oversample.head()

Unnamed: 0,Merged_Text,Score
33553,not as good as it could have be but still very...,3
9427,not my kind of show my roommate has be forcing...,0
199,terrible movie i cannot believe people think t...,0
12447,good movie bad movie why do we need a movie li...,1
39489,a classic only because it be a first tomatoes ...,3


In [46]:
df_final_oversample.to_csv('/content/drive/MyDrive/CS505_final_shared_folder/df_final_oversample.csv', index=False)

In [27]:
df_final=pd.concat([df_final_class1, df_final_class2, df_final_class3, df_final_class4, df_final_class5], axis=0, ignore_index=True)



In [105]:
df_final.shape

(44669, 5)

In [28]:
df_final= df_final.sample(frac=1.0, random_state=42)

In [29]:
df_final=df_final[["Id", "Merged_Text", "Score"]]

In [40]:
df_final.head()

Unnamed: 0,Id,Merged_Text,Score
43494,1396060,Great Movie about Survival & Courage Not an en...,5.0
34646,20288,"Great zombie movies, no more and no less. I fe...",4.0
12389,1559715,Actors need work like everyone else. Actors ne...,2.0
44012,517490,A love that can never be This movie is beautif...,5.0
8569,780023,Crazy Nonsense I thought I would have liked it...,2.0


In [123]:
df_final.to_csv('/content/drive/MyDrive/CS505_final_drive/data/df_final.csv', index=False)


In [30]:
df_final["Merged_Text"]=df_final["Merged_Text"].apply(text_processing)


In [31]:
df_final["Score"]=df_final["Score"].apply(int)-1

In [48]:
df_final.head()

Unnamed: 0,Id,Merged_Text,Score
43494,1396060,great movie about survival courage not an enti...,4
34646,20288,great zombie movies no more and no less i feel...,3
12389,1559715,actors need work like everyone else actors nee...,1
44012,517490,a love that can never be this movie be beautif...,4
8569,780023,crazy nonsense i thought i would have liked it...,1


In [32]:
df_final.to_csv('/content/drive/MyDrive/CS505_final_shared_folder/df_final.csv', index=False)