In [1]:
# Print the Python version of the runtime (shell command).
!python3 -V

Python 3.8.16


In [2]:
def random_seed(seed_value, use_cuda):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed_value: Seed passed unchanged to every generator.
        use_cuda: When truthy, also seed the CUDA generators, enable
            deterministic cuDNN and disable cuDNN benchmark mode.

    Returns:
        None.
    """
    # Imported locally: no earlier cell in the visible notebook imports these
    # modules, so relying on globals raised NameError on a fresh kernel.
    import random

    import numpy as np
    import torch

    np.random.seed(seed_value)  # NumPy global RNG
    torch.manual_seed(seed_value)  # PyTorch RNG
    random.seed(seed_value)  # Python stdlib RNG
    if use_cuda:
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)  # all GPUs
        torch.backends.cudnn.deterministic = True  # needed
        torch.backends.cudnn.benchmark = False
# Remember to use num_workers=0 when creating the DataBunch.

In [3]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
from google.colab import drive
drive.mount('/content/drive') 

Mounted at /content/drive


In [4]:
# List the per-category pickled DataFrames stored on Drive.
!ls /content/drive/MyDrive/YelpDataset/dataframes

AutoDF.pkl		     HealthMedicalDF.pkl  restaurantDF.pkl
beautyandSpaDF.pkl	     HomeServicesDF.pkl   shoppingDF.pkl
EventPlanningServicesDF.pkl  LocalServicesDF.pkl
foodDF.pkl		     NightlifeDF.pkl


In [5]:
# Copy the restaurant DataFrame pickle from Drive to the local /content directory.
%cp /content/drive/MyDrive/YelpDataset/dataframes/restaurantDF.pkl /content/

In [6]:
# Load the restaurant reviews from the pickle (relative path: assumes the
# working directory is /content, where the file was copied — TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
import pandas as pd
df = pd.read_pickle('restaurantDF.pkl')

In [7]:
# Rename the source columns to the names used by the rest of the notebook.
df = df.rename(
    columns={
        'text': 'reviewText',
        'stars': 'overall',
        'categories': 'category',
    }
)

In [8]:
# Keep only the business metadata, the rating, the category and the review text.
df = df[['business_id','name','city','state','overall','category','reviewText']]

In [9]:
# Preview the first row to check the renamed/selected columns.
df.head(1)

Unnamed: 0,business_id,name,city,state,overall,category,reviewText
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,PA,4.0,Restaurants,This is nice little Chinese bakery in the hear...


In [10]:
# Approximate word count per review: number of space characters + 1.
# Consecutive spaces are each counted, so the value can overestimate.
df['sentenceLength'] = df['reviewText'].str.count(' ') + 1

In [11]:
# (rows, columns) before the length filter.
df.shape

(3773770, 8)

In [12]:
# Upper bound on `sentenceLength` (approximate word count) for reviews that are kept.
N_OF_WORDS_IN_REVIEW = 200

In [13]:
# Drop reviews longer than N_OF_WORDS_IN_REVIEW; rows whose length is NaN are
# dropped too, because the comparison evaluates to False for them.
df = df[df['sentenceLength']<=N_OF_WORDS_IN_REVIEW]

In [14]:
# (rows, columns) after the length filter.
df.shape

(3374443, 8)

In [15]:
# Preview the filtered frame, now including the sentenceLength column.
df.head(2)

Unnamed: 0,business_id,name,city,state,overall,category,reviewText,sentenceLength
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,PA,4.0,Restaurants,This is nice little Chinese bakery in the hear...,70
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,PA,4.0,Restaurants,This is the bakery I usually go to in Chinatow...,116


In [25]:
# Draw a reproducible 1% sample, drop rows with missing values and renumber the
# rows. reset_index() is called without drop=True, so the previous index is
# kept as an extra "index" column.
sample_df = (
    df.sample(frac=0.01, random_state=2021)
    .dropna()
    .reset_index()
)

In [26]:
# (rows, columns) of the 1% sample.
sample_df.shape

(33744, 9)

In [None]:
# Drop the reference to the full review frame; the visible cells below only
# use sample_df. The explicit garbage-collection call is left disabled.
del df
#gc.collect()

In [27]:
#import/install gpt3 model
!pip install --upgrade openai
import openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.25.0.tar.gz (44 kB)
[K     |████████████████████████████████| 44 kB 3.1 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pandas-stubs>=1.1.0.11
  Downloading pandas_stubs-1.5.2.221213-py3-none-any.whl (147 kB)
[K     |████████████████████████████████| 147 kB 26.7 MB/s 
Collecting types-pytz>=2022.1.1
  Downloading types_pytz-2022.6.0.1-py3-none-any.whl (4.7 kB)
Building wheels for collected packages: openai
  Building wheel for openai (PEP 517) ... [?25l[?25hdone
  Created wheel for openai: filename=openai-0.25.0-py3-none-any.whl size=55880 sha256=a74da832653277f677561bbafe2fc0282b753b0be145cbedeacd4f465ee61d4e
  Stored in directory: /root/.cache/pip/wheels/4b/92/33/6f57c7aae0b16875267999a50570e81f15eecec577ebe

In [28]:
!openai -k "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" api fine_tunes.list

{
  "data": [
    {
      "created_at": 1670396255,
      "fine_tuned_model": "ada:ft-new-york-university-2022-12-07-06-58-34",
      "hyperparams": {
        "batch_size": 1,
        "learning_rate_multiplier": 0.1,
        "n_epochs": 1,
        "prompt_loss_weight": 0.01
      },
      "id": "ft-thRZbmgrRzU1HkLYQDEaFvyr",
      "model": "ada",
      "object": "fine-tune",
      "organization_id": "org-xW0Dhs9vleFDuwl6YbhZdKPz",
      "result_files": [
        {
          "bytes": 5168,
          "created_at": 1670396315,
          "filename": "compiled_results.csv",
          "id": "file-IFdzZTA8YAPuRl3AseUdPLPg",
          "object": "file",
          "purpose": "fine-tune-results",
          "status": "processed",
          "status_details": null
        }
      ],
      "status": "succeeded",
      "training_files": [
        {
          "bytes": 36459,
          "created_at": 1670396255,
          "filename": "/content/DataForGPT3_prepared.jsonl",
          "id": "file-xtiaVssGfp

In [29]:
# Configure the OpenAI client credentials.
# SECURITY: the key is taken from the OPENAI_API_KEY environment variable, or
# prompted for without echo, instead of being stored in the notebook. The key
# that used to be hardcoded here is exposed and has to be revoked.
import os
from getpass import getpass

openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

### Generate review examples for the Restaurant category

In [30]:
# Identifier of the fine-tuned GPT-3 (ada) model passed to the completion
# requests; according to the original note it was fine-tuned on 5,500 samples.
MODEL ="ada:ft-new-york-university-2022-12-11-05-52-26"

In [31]:

# Number of leading words of a real review that are used as the prompt.
N_INITIAL_WORDS = 5
# Row count of the sample; denominator of each length bin's proportion.
LEN_SAMPLE_DF = sample_df.shape[0]
# Target number of reviews used to scale the per-bin counts. Every non-skipped
# bin gets one extra review, so the final total is larger than this value.
N_REVIEWS = 500
# Edges of the sentenceLength bins: bin i covers [BINS[i], BINS[i+1]).
# Reviews shorter than 10 or longer than 148 words fall outside every bin.
BINS = list(range(10,150))

In [32]:
import warnings
import numpy as np
import time

from sklearn.model_selection import train_test_split

# Build a labelled data set of computer-generated ("CG") and original ("OR")
# reviews. For every sentenceLength bin the reviews are split in half: one half
# supplies prompts for the fine-tuned model, the other half supplies untouched
# original reviews, and the same number of rows is taken from each half.

# Hide library warnings so the output of this long-running loop stays readable.
warnings.filterwarnings(action="ignore")

API_PAUSE_SECONDS = 2.4  # pause after every completion request
MAX_API_ATTEMPTS = 5     # tries per prompt before the error is re-raised


def _completion_with_retry(prompt, max_tokens):
    """Request one completion from MODEL, retrying transient API failures.

    Without the retry a single rate-limit or connection error aborts the loop
    and discards every review generated so far. Non-transient errors (bad
    request, authentication) are not caught.
    """
    retryable = (
        openai.error.RateLimitError,
        openai.error.APIConnectionError,
        openai.error.ServiceUnavailableError,
        openai.error.APIError,
    )
    for attempt in range(1, MAX_API_ATTEMPTS + 1):
        try:
            return openai.Completion.create(
                model=MODEL,
                prompt=prompt,
                max_tokens=max_tokens,
                temperature=0.7,
                top_p=0.92,
            )
        except retryable as err:
            if attempt == MAX_API_ATTEMPTS:
                raise
            backoff = API_PAUSE_SECONDS * 2 ** attempt
            print(f"OpenAI request failed ({err}); retrying in {backoff:.1f}s")
            time.sleep(backoff)


generated_df_rows = []
for i in range(len(BINS) - 1):
    bin_start, bin_end = BINS[i], BINS[i + 1]
    sample_row_df = sample_df[
        (sample_df["sentenceLength"] >= bin_start)
        & (sample_df["sentenceLength"] < bin_end)
    ]
    bin_proportion = sample_row_df.shape[0] / float(LEN_SAMPLE_DF)
    n_reviews_to_generate = int(bin_proportion * N_REVIEWS) + 1
    try:
        or_df, cg_df = train_test_split(sample_row_df, test_size=0.5, random_state=2021)
    except ValueError as e:
        # A bin with fewer than two reviews cannot be split in half; skip it.
        print(e)
        continue
    # Never request more rows than either half holds: sample() without
    # replacement raises when n exceeds the number of available rows.
    n_reviews_to_generate = min(n_reviews_to_generate, len(cg_df), len(or_df))
    sample_reviews_df = cg_df.sample(n=n_reviews_to_generate, random_state=2021)
    or_sample_reviews_df = or_df.sample(n=n_reviews_to_generate, random_state=2021)
    # Completion budget (max_tokens): the bin's lower edge, but 20 for the
    # bins that start at 10..20.
    max_len = 20 if 10 <= bin_start <= 20 else bin_start
    for index, row in sample_reviews_df.iterrows():
        text = row["reviewText"]
        # The prompt is the first N_INITIAL_WORDS space-separated words.
        prompt = " ".join(text.strip().split(" ")[:N_INITIAL_WORDS])
        pred = _completion_with_retry(prompt, max_len)
        decoded_preds = prompt + pred.choices[0].text
        generated_df_rows.append([row["category"], row["overall"], decoded_preds, "CG"])
        time.sleep(API_PAUSE_SECONDS)
    for index, row in or_sample_reviews_df.iterrows():
        generated_df_rows.append([row["category"], row["overall"], row["reviewText"], "OR"])
generated_df = pd.DataFrame(generated_df_rows, columns=["category", "rating", "text", "label"])

In [33]:
# (rows, columns) of the combined CG + OR data set.
generated_df.shape

(1042, 4)

In [34]:
# Display the generated data set before post-processing.
generated_df

Unnamed: 0,category,rating,text,label
0,Restaurants,4.0,Excellent food. Large servings. Great burger...,CG
1,Restaurants,4.0,best chicken with green and yellow sauce cilan...,OR
2,Restaurants,4.0,"Hit the spot. Great food, friendly servers. ...",CG
3,Restaurants,4.5,"Consider yourself lucky if you can get one, bo...",OR
4,Restaurants,3.5,"Well, I thought it was awesome. The restaurant...",CG
...,...,...,...,...
1037,Restaurants,4.0,People think it's a stuffy place - not at all ...,OR
1038,Restaurants,4.0,"When I visit my sister, I always bring my cust...",CG
1039,Restaurants,4.5,"This is still a regular stop for us, but it's ...",CG
1040,Restaurants,3.0,This place just recently changed ownership and...,OR


In [35]:
# Save the unprocessed data set; index=None is falsy, so the row index is not written.
generated_df.to_csv("Generated_Sentences_RestaurantGPT3UnProcessed.csv", index=None)

In [36]:
# Copy the unprocessed CSV to the GPT3 output folder on Drive.
%cp /content/Generated_Sentences_RestaurantGPT3UnProcessed.csv /content/drive/MyDrive/YelpDataset/generatedData/GPT3

In [37]:
def post_process(str, k=4):
    """Trim a short, unfinished trailing fragment from a review text.

    Generation stops at a token limit, so a text may end in the middle of a
    sentence. When the words after the last sentence end number at most ``k``,
    they are removed and the text up to that sentence end is returned.
    Otherwise the text is returned unchanged.

    The text is cut at the last sentence end instead of being split on "." and
    re-joined, so decimals ("$4.50"), ellipses and the original spacing of the
    kept part are preserved. "!" and "?" count as sentence ends as well, and
    trailing whitespace no longer causes the last complete sentence to be
    treated as the fragment.

    Args:
        str: The review text. The name shadows the built-in ``str`` inside the
            function; it is kept so keyword callers continue to work.
        k: Maximum number of whitespace-separated words in the trailing
            fragment for it to be removed.

    Returns:
        The input unchanged when it already ends with ".", "!" or "?" (ignoring
        trailing whitespace), when no complete sentence precedes the fragment,
        or when the fragment has more than ``k`` words; otherwise the stripped
        text up to and including the last sentence end.
    """
    text = str
    trimmed = text.rstrip()
    if trimmed.endswith((".", "!", "?")):
        return text

    # Scan backwards for the last sentence end. A "." between two digits is a
    # decimal point, not a sentence end.
    last_end = -1
    for pos in range(len(trimmed) - 1, -1, -1):
        char = trimmed[pos]
        if char in "!?":
            last_end = pos
            break
        if char == ".":
            is_decimal_point = (
                0 < pos < len(trimmed) - 1
                and trimmed[pos - 1].isdigit()
                and trimmed[pos + 1].isdigit()
            )
            if not is_decimal_point:
                last_end = pos
                break
    if last_end == -1:
        return text

    kept = trimmed[: last_end + 1].strip()
    # Require real content before the fragment (not only punctuation).
    if not kept.strip(".!? \t\r\n"):
        return text
    fragment_words = trimmed[last_end + 1:].split()
    if len(fragment_words) > k:
        return text
    return kept

In [38]:
# Store the post-processed text of every row (CG and OR alike) in a new column.
generated_df['text_'] = generated_df['text'].apply(post_process)

In [39]:
# Compare the raw `text` column with the post-processed `text_` column.
generated_df.head()

Unnamed: 0,category,rating,text,label,text_
0,Restaurants,4.0,Excellent food. Large servings. Great burger...,CG,Excellent food. Large servings. Great burger...
1,Restaurants,4.0,best chicken with green and yellow sauce cilan...,OR,best chicken with green and yellow sauce cilan...
2,Restaurants,4.0,"Hit the spot. Great food, friendly servers. ...",CG,"Hit the spot. Great food, friendly servers. Pr..."
3,Restaurants,4.5,"Consider yourself lucky if you can get one, bo...",OR,"Consider yourself lucky if you can get one, bo..."
4,Restaurants,3.5,"Well, I thought it was awesome. The restaurant...",CG,"Well, I thought it was awesome. The restaurant..."


In [40]:
# Remove the raw text column in place; only the post-processed `text_` is kept.
# Re-running this cell raises KeyError because the column is already gone.
generated_df.drop(columns=["text"], inplace=True)

In [41]:
# Display the final data set (category, rating, label, text_).
generated_df

Unnamed: 0,category,rating,label,text_
0,Restaurants,4.0,CG,Excellent food. Large servings. Great burger...
1,Restaurants,4.0,OR,best chicken with green and yellow sauce cilan...
2,Restaurants,4.0,CG,"Hit the spot. Great food, friendly servers. Pr..."
3,Restaurants,4.5,OR,"Consider yourself lucky if you can get one, bo..."
4,Restaurants,3.5,CG,"Well, I thought it was awesome. The restaurant..."
...,...,...,...,...
1037,Restaurants,4.0,OR,People think it's a stuffy place - not at all ...
1038,Restaurants,4.0,CG,"When I visit my sister, I always bring my cust..."
1039,Restaurants,4.5,CG,"This is still a regular stop for us, but it's ..."
1040,Restaurants,3.0,OR,This place just recently changed ownership and...


In [42]:
# Save the post-processed data set; index=None is falsy, so the row index is not written.
generated_df.to_csv("Generated_Sentences_RestaurantGPT3.csv", index=None)

In [44]:
# Copy the post-processed CSV to the GPT3 output folder on Drive.
%cp /content/Generated_Sentences_RestaurantGPT3.csv /content/drive/MyDrive/YelpDataset/generatedData/GPT3