### 1. Grab original text from Wikipedia

We will first need to create a personal API token by following the tutorial: [Wikimedia API Portal](https://api.wikimedia.org/wiki/Documentation/Getting_started). Note that a personal API token is rate-limited to 5,000 requests per hour.

In [None]:
import requests
def get_response(search_query, language_code = 'en', number_of_results = 1):
    '''
    Send a page-search request to the Wikimedia API and return the raw response.

    search_query: the topic to search for
    search_query = 'Obama' # example

    language_code: the Wikipedia language edition to search in
    language_code = 'en' # example

    number_of_results: passed as the 'limit' of the search, i.e. how many
    pages are requested
    number_of_results = 1 # example

    return: the response object from requests.get, unmodified (no status
    check and no JSON decoding happen here)
    '''
    search_url = 'https://api.wikimedia.org/core/v1/wikipedia/' + language_code + '/search/page'

    request_headers = {
        # Empty in this file; presumably meant to hold the personal API token
        'Authorization': '',
        'User-Agent': 'ChatGPT-Detection-Project',
    }
    query_params = {'q': search_query, 'limit': number_of_results}

    return requests.get(search_url, headers=request_headers, params=query_params)

In [None]:
# Build the Wikipedia article URLs from the search results
import json
def get_url(response, language_code):
    '''
    Build Wikipedia article urls from a search response.

    input:
      response: object returned by get_response; only its `.text` attribute
        is read, and it must hold a JSON document with a top-level 'pages' list
      language_code: language edition used as the url subdomain, e.g. 'en'

    output: list with one url per page, in the order the pages are listed

    raises: KeyError if the JSON has no 'pages' entry or a page has no 'key'
    '''
    payload = json.loads(response.text)
    wiki_root = 'https://' + language_code + '.wikipedia.org/wiki/'

    # Only the 'key' field is needed to build the url. Other fields are not
    # read, so a page that lacks e.g. 'title' no longer aborts the batch.
    return [wiki_root + page['key'] for page in payload['pages']]

In [None]:
def clean_text(text):
    """
    Remove parentheses from text together with everything inside them.

    Nesting is handled by tracking the current parenthesis depth: characters
    are kept only while the depth is zero. A stray ')' that has no matching
    '(' is dropped on its own and does not affect the text that follows it.
    An unclosed '(' discards everything after it.

    text: the string to clean
    return: the cleaned string (whitespace is left untouched)
    """
    kept = []
    depth = 0
    for char in text:
        if char == '(':
            depth += 1
        elif char == ')':
            # Clamp at zero: letting the depth go negative on an unmatched
            # ')' would make every later character look "inside" parentheses
            # and silently discard the rest of the text.
            depth = max(depth - 1, 0)
        elif depth == 0:
            kept.append(char)
    return ''.join(kept)

In [None]:
from bs4 import BeautifulSoup
import re

def grab_article(urls, limit = 100):
    '''
    Download each Wikipedia page and keep its first paragraph if it is long enough.

    urls: List
    urls are grabbed by get_url function

    limit: the first paragraph is kept only when it has more than this many
    characters (measured before stripping and before clean_text is applied)

    return: list of first paragraphs with bracketed markers (e.g. [1]) and
    parenthesised text removed. Pages that are skipped contribute nothing,
    so the list can be shorter than urls.
    '''
    articles = []
    for article_url in urls:
        response = requests.get(article_url)

        # to bs obj
        soup = BeautifulSoup(response.content, 'html.parser')

        body_content = soup.find('div', {'id': 'bodyContent'})
        if body_content is None:
            # find() returns None when the div is absent (e.g. an error page).
            # Skip this url instead of crashing the whole batch.
            continue

        # Join the paragraphs of the main text together into a single string
        main_text = '\n'.join(p.text for p in body_content.find_all('p'))

        # Drop bracketed markers such as [1]
        article_text = re.sub(r'\[.*?\]', '', main_text)

        # Take the first line that actually contains text. Looking only at
        # index 0 dropped the whole article whenever the page started with an
        # empty or whitespace-only <p>.
        # NOTE(review): reportedly common on pages with infoboxes - confirm.
        first_paragraph = next(
            (line for line in article_text.split('\n') if line.strip()),
            '',
        )
        if len(first_paragraph) > limit:
            articles.append(clean_text(first_paragraph.strip()))

    return articles

### 2. OpenAI API: paraphrase text

In [None]:
#use openai api to rephrase the text
import openai

# Set up the OpenAI API client.
# NOTE(review): "Secret" looks like a placeholder for the real key - replace it
# locally before running and avoid committing an actual key to the notebook.
openai.api_key = "Secret"

# Define a function to rephrase a given text
def rephrase_text(text):
    '''
    Ask the OpenAI completion endpoint to rephrase a piece of text.

    input: human generated text
    output: AI rephrased text, taken from the first returned choice, with
    newline characters stripped from both ends
    '''
    request_options = {
        'engine': "text-davinci-003",
        'prompt': f"Rephrase the first section from:\n{text}",
        'temperature': 1.3,
        'max_tokens': 2048,
        'stop': None,
        'timeout': 15,
    }
    completion = openai.Completion.create(**request_options)

    # Only the first choice is used
    first_choice = completion.choices[0]
    return first_choice.text.strip("\n")


### 3. Main code to collect the data

#### Important notice:

Due to the character limit we set in `grab_article`, the share of texts that pass the filter is fairly low. Therefore, it is recommended to increase `number_of_results` when calling `get_both_text`.

ex: 
search_queries = ['television', 'game'], language_code = 'en', number_of_results = 20 (20 search results for each topic in search_queries, so 40 in total)

limit = 100

number of texts left after filtering: 12

In [None]:
import pandas as pd
import time
def get_both_text(search_queries, language_code = 'en', number_of_results = 1):
    '''
    Run the whole pipeline for a list of topics: search Wikipedia, download
    the first paragraph of every hit, and have each paragraph rephrased.

    search_queries: list of topics to search for
    search_queries = ['Obama'] # example

    language_code: language edition to search in, forwarded to get_response
    and get_url
    language_code = 'en' # example

    number_of_results: how many urls are requested for each topic
    number_of_results = 1 # example, gives 1 url per element of search_queries

    Paragraphs are fetched with grab_article using limit = 100, so only first
    paragraphs longer than 100 characters are kept. The function sleeps for
    5 seconds after every rephrase request and prints the topic and its urls
    as it goes.

    return format: Human_Generated, AI_generated (two lists, index-aligned)
    '''
    human_generated_text, ai_generated_text = [], []

    for topic in search_queries:
        print(f'topic is: {topic}')

        # search wikipedia for the topic and turn the hits into urls
        search_response = get_response(
            topic,
            language_code = language_code,
            number_of_results = number_of_results,
        )
        url_list = get_url(search_response, language_code = language_code)
        print("urls: \n", url_list)

        # first paragraphs that pass the length filter
        paragraphs = grab_article(url_list, limit = 100)
        human_generated_text.extend(paragraphs)

        for paragraph in paragraphs:
            paraphrase = rephrase_text(paragraph)
            time.sleep(5)
            ai_generated_text.append(paraphrase)

    # To get a dataframe instead, build it from the two returned lists,
    # e.g. pd.DataFrame({'Human': human_generated_text, 'AI': ai_generated_text})

    return human_generated_text, ai_generated_text
    

In [None]:
# Sports and recreation topics used as search queries
topic_lsts = [
    'Air sports',
    'American football',
    'Association football',
    'Auto racing',
    'Baseball',
    'Basketball',
    'Boating',
    'Boxing',
    'Canoeing',
    'Cricket',
    'Cycling',
    'Exercise',
    'Fishing',
    'Golf',
    'Gymnastics',
    'Hobbies',
    'Horse racing',
    'Ice hockey',
    'Lacrosse',
    'Olympic Games',
    'Rugby league',
    'Rugby union',
    'Sailing',
    'Skiing',
    'Swimming',
    'Tennis',
    'Track and field',
    'Walking trails',
    'Water sports',
    'Whitewater sports',
]
print(f"There are in total {len(topic_lsts)} topics")

# Request 100 search results per topic, in English
human_generated, ai_generated = get_both_text(
    topic_lsts,
    language_code = 'en',
    number_of_results = 100,
)

In [None]:
# human_generated

In [None]:
# ai_generated

In [None]:
# Number of human-written paragraphs collected (shown as the cell output)
len(human_generated)

### 4. Write it to tsv file

path : main_page/data

In [None]:
import csv

path = "../data/parallel_text_3.tsv"

# newline='' is what the csv module requires of the file object so that the
# writer controls line endings itself; without it, rows can gain extra blank
# lines on platforms that translate '\n'.
# The encoding is pinned to UTF-8 so non-ASCII article text is written the
# same way on every platform instead of depending on the locale default.
with open(path, 'w', newline='', encoding='utf-8') as write_tsv:
    writer = csv.writer(write_tsv, delimiter='\t')
    writer.writerow(['human_text', 'ai_text'])
    # zip stops at the shorter list, so only complete (human, ai) pairs are written
    writer.writerows(zip(human_generated, ai_generated))

In [None]:
# Read the TSV at `path` back in as a tab-separated table and show it
# (the bare `df` expression is rendered as the cell output).
df = pd.read_csv(path, sep='\t')
df

In [None]:
# Spot-check: human-written text at row label 733 (assumes the table has such a row)
df['human_text'][733]

In [None]:
# Spot-check: AI-rephrased text at the same row label, 733
df['ai_text'][733]