In [19]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the question/answer dataset, then display its (rows, columns) shape.
df = pd.read_csv('AI.csv')
df.shape

(503, 2)

In [3]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0,Question,Answer
0,Who did the first work generally recognized as...,Warren McCulloch and Walter Pitts (1943).\n
1,What sources was drawn on the formation of the...,knowledge of the basic physiology and function...
2,Who created the Hebbian learning rule?,Donald Hebb (1949).\n
3,When the first neural network is built?,1950.\n
4,What is the first neural network called?,The SNARC.\n


`EDA` (exploratory data analysis)

In [7]:
# Count missing values per column.
df.isna().sum()

Question    0
Answer      0
dtype: int64

In [8]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

np.int64(0)

In [6]:
# Inspect the first raw answer (row label 0) to see what needs cleaning.
df.loc[0, 'Answer']

'Warren McCulloch and Walter Pitts (1943).\n'

* Clean the data

In [14]:
def clean_dataframe(df):
    """Tidy the ``Question`` and ``Answer`` text columns of ``df`` in place.

    Leading and trailing whitespace is stripped from both columns, and any
    newline characters still present inside an answer are deleted.

    Args:
        df (pd.DataFrame): Frame with string ``Question`` and ``Answer`` columns.

    Returns:
        pd.DataFrame: The same ``df`` object that was passed in, now cleaned.
    """
    # Trim surrounding whitespace from both text columns.
    for column in ('Question', 'Answer'):
        df[column] = df[column].str.strip()

    # Drop newline characters remaining inside the answer text.
    df['Answer'] = df['Answer'].str.replace(r'\n', '', regex=True)
    return df

In [15]:
# Apply the cleaning function. clean_dataframe assigns the cleaned columns back
# onto the frame it receives and returns that same frame, so `cleaned_df` and
# `df` refer to the same (now cleaned) object after this cell.
cleaned_df = clean_dataframe(df)

In [18]:
# Sanity check after cleaning: the first answer should have no trailing newline.
cleaned_df.loc[0, 'Answer']

'Warren McCulloch and Walter Pitts (1943).'

In [20]:

# Load the pretrained GPT-2 tokenizer and language-model head.
# Both are kept as module-level names because generate_response reads them.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Function that produces a reply to a question.
def generate_response(question, df, max_length=50):
    """
    Answer a question from the Q&A data, falling back to GPT-2 generation.

    The question is first looked up in ``df`` (ignoring surrounding
    whitespace on both sides of the comparison). Only when no stored answer
    exists is the module-level GPT-2 ``model`` asked to continue a
    ``"Q: ...\\nA:"`` prompt.

    Args:
        question (str): The input question.
        df (pd.DataFrame): Data holding ``Question`` and ``Answer`` columns.
        max_length (int): Maximum number of tokens GPT-2 may generate for the
            answer. The prompt tokens are not counted against this budget.

    Returns:
        str: The stored answer of the first matching row, or otherwise the
        generated continuation only (the prompt text is not echoed back).
    """
    question = question.strip()

    # Single pass over the column: collect the answers of all matching rows.
    stored_answers = df.loc[df['Question'].str.strip() == question, 'Answer']
    if not stored_answers.empty:
        return stored_answers.iloc[0]

    # No stored answer: let GPT-2 continue a Q/A-style prompt.
    prompt = f"Q: {question}\nA:"
    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"]
    outputs = model.generate(
        input_ids,
        # Explicit mask: GPT-2 reuses EOS as the pad token, so it cannot be inferred.
        attention_mask=encoded["attention_mask"],
        # Budget applies to new tokens only, so a long question still gets an answer.
        max_new_tokens=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    # generate() returns prompt + continuation; keep only the continuation.
    new_tokens = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


### Test before moving to Streamlit

In [21]:
test_questions = [
    "Who created the Hebbian learning rule?",  # present in the data
    # Meant as a question that is absent from the data.
    # NOTE(review): the recorded answer reads like a dataset entry — confirm.
    "What is AI?",
]

# Show the results. The cleaned frame is passed explicitly so the lookup does
# not depend on clean_dataframe having modified `df` in place.
for question in test_questions:
    response = generate_response(question, cleaned_df)
    print(f"Question: {question}")
    print(f"Answer: {response}\n")

Question: Who created the Hebbian learning rule?
Answer: Donald Hebb (1949).

Question: What is AI?
Answer: Artificial Intelligence (AI) is the part of computer science concerned with designing intelligent computer systems.

