## Importing Necessary Libraries

In [2]:
import re
import nltk
import warnings
import pandas as pd
import seaborn as sns
from tqdm import tqdm 
from wordcloud import WordCloud
from collections import Counter
import matplotlib.pyplot as plt
# Hide every warning for the remainder of the session.
warnings.filterwarnings(action="ignore")
# Render matplotlib figures at 300 DPI.
plt.rcParams.update({'figure.dpi': 300})

# Fetch the NLTK 'punkt' resource; returns True on success.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Loading the Data

In [8]:

# Read the raw question/answer pairs from the CSV file in the working directory.
data = pd.read_csv('NvidiaDocumentationQandApairs_Short_huge.csv', encoding='UTF-8')

# Keep only the 'question' and 'short_answer' columns; every other column is dropped.
data = data[['question', 'short_answer']].reset_index(drop=True)

# Render the frame once. A bare `data` expression in the middle of a cell
# produces no output, so only the explicit display() call is kept.
print("First few rows of Data:")
display(data)


First few rows of Data:


Unnamed: 0,question,short_answer
0,What is Hybridizer?,Hybridizer is a compiler for programming GPUs ...
1,How does Hybridizer generate optimized code?,Hybridizer generates optimized code using deco...
2,What are some parallelization patterns mention...,The text mentions parallelization patterns lik...
3,How can you benefit from accelerators without ...,Use patterns like Parallel.For or CUDA-like di...
4,What is an example of using Hybridizer?,Using Parallel.For with a lambda to utilize th...
...,...,...
7103,What is the focus of the GTC event in 2015?,The 2015 GTC event focused on GPU code optimiz...
7104,How were the main changes made to the code for...,"The code was optimized by merging kernels, reg..."
7105,What are some key fields in the cudaDeviceProp...,The key fields in the cudaDeviceProp struct ar...
7106,What did changing the kernel approach achieve ...,The kernel approach change reduced the iterati...


## Statistical Analysis

In [9]:
# Report the dtype pandas assigned to each column.
print("Data Types in Train Data:")
column_dtypes = data.dtypes
display(column_dtypes)


Data Types in Train Data:


question        object
short_answer    object
dtype: object

In [10]:
# Count missing values per column, then list the columns with the most nulls first.
null_counts = data.isnull().sum()
print("Null Values in Train Data, Sorted:")
print(null_counts.sort_values(ascending=False))


Null Values in Train Data, Sorted:
question        0
short_answer    0
dtype: int64


In [11]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
duplicated_rows_count = data.duplicated().sum()
print("Exact Duplicated Rows in Data:", duplicated_rows_count)
if duplicated_rows_count > 0:
    print("\nDuplicated Rows in the Data:")
    # keep=False flags every member of a duplicate group, so each original
    # is listed together with its copies.
    every_copy_mask = data.duplicated(keep=False)
    duplicated_train_rows = data[every_copy_mask]
    display(duplicated_train_rows)

Exact Duplicated Rows in Data: 152

Duplicated Rows in the Data:


Unnamed: 0,question,short_answer
22,What architectures does CUDA 11.3 support?,"CUDA 11.3 supports NVIDIA Ampere, x86, Arm ser..."
401,What is the role of Tensor Cores in the Volta ...,Tensor Cores in Volta architecture are designe...
411,What is the role of Tensor Cores in the Volta ...,Tensor Cores in Volta architecture are designe...
662,What types of containers can you run within th...,You can run any NVIDIA Linux container within ...
683,What types of containers can you run within th...,You can run any NVIDIA Linux container within ...
...,...,...
6340,How does quantization simplify the tree constr...,Quantization simplifies tree construction in g...
6617,When might it be necessary to write custom CUD...,Custom CUDA code is needed for complex GPU tas...
6637,When might it be necessary to write custom CUD...,Custom CUDA code is needed for complex GPU tas...
7016,How can you reduce the performance impact of e...,Use preprocessor macros to include error check...


## Data Cleaning

In [13]:
# Remove rows containing any null value, then exact duplicate rows,
# and renumber the index from 0.
data = (
    data
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

# Report the shape after cleaning and show the cleaned frame as the cell output.
print("New Shape of Cleaned Data:", data.shape)
data

New Shape of Cleaned Data: (6956, 2)


Unnamed: 0,question,short_answer
0,What is Hybridizer?,Hybridizer is a compiler for programming GPUs ...
1,How does Hybridizer generate optimized code?,Hybridizer generates optimized code using deco...
2,What are some parallelization patterns mention...,The text mentions parallelization patterns lik...
3,How can you benefit from accelerators without ...,Use patterns like Parallel.For or CUDA-like di...
4,What is an example of using Hybridizer?,Using Parallel.For with a lambda to utilize th...
...,...,...
6951,What is the focus of the GTC event in 2015?,The 2015 GTC event focused on GPU code optimiz...
6952,How were the main changes made to the code for...,"The code was optimized by merging kernels, reg..."
6953,What are some key fields in the cudaDeviceProp...,The key fields in the cudaDeviceProp struct ar...
6954,What did changing the kernel approach achieve ...,The kernel approach change reduced the iterati...


## Data Normalization

In [8]:
def clean_text(text):
    """Normalise one piece of text for the cleaned dataset.

    Steps, in order:
      1. Non-string input is converted with ``str()``.
      2. Only the part before the first real newline is kept.
      3. The text is lowercased.
      4. The literal two-character sequence backslash + ``n`` and any
         carriage return are each replaced by a space.
      5. ``<br>`` variants, then any remaining ``<...>`` tags, are replaced
         by a space.
      6. ``http://`` / ``https://`` URLs are removed.
      7. Runs of whitespace collapse to one space and the ends are stripped.

    Args:
        text: The value to clean; any object is accepted.

    Returns:
        The cleaned, lowercased, single-line string.
    """
    raw = text if isinstance(text, str) else str(text)

    # NOTE(review): everything after the first real newline is discarded here,
    # not joined with a space; confirm that truncation is intended.
    cleaned = raw.partition('\n')[0].lower()

    # '\\n' is the escaped form (backslash followed by "n"), not a newline.
    for token in ('\\n', '\r'):
        cleaned = cleaned.replace(token, ' ')

    # Order matters: <br> tags are handled before the generic tag pattern,
    # and whitespace is collapsed only after tags and URLs are gone.
    substitutions = (
        (r'<br\s*/?>', ' '),
        ('<.*?>', ' '),
        (r'http[s]?://\S+', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

# Clean both text columns with clean_text, showing a progress bar for each.
# 'question' is overwritten in place; the cleaned 'short_answer' text is
# stored in a new 'answer' column and 'short_answer' itself is left as is.
data['question'] = list(map(clean_text, tqdm(data['question'], desc='Cleaning Question Titles')))
data['answer'] = list(map(clean_text, tqdm(data['short_answer'], desc='Cleaning Best Answers')))

# Only the two cleaned columns are previewed and exported.
cleaned_view = data[['question', 'answer']]

# Preview the first rows to verify the cleaning.
print("Head of Cleaned Data:")
display(cleaned_view.head())

# Write the cleaned columns to CSV without the index.
cleaned_view.to_csv('Nvidia_cleaned.csv', index=False)

# Confirm that the export finished.
print("Files have been successfully saved.")

Cleaning Question Titles: 100%|██████████| 6956/6956 [00:00<00:00, 137443.02it/s]
Cleaning Best Answers: 100%|██████████| 6956/6956 [00:00<00:00, 141924.58it/s]

Head of Cleaned Data:





Unnamed: 0,question,answer
0,what is hybridizer?,hybridizer is a compiler for programming gpus and accelerators using c# or .net assembly.
1,how does hybridizer generate optimized code?,"hybridizer generates optimized code using decorated symbols for parallelism, designed for multic..."
2,what are some parallelization patterns mentioned in the text?,the text mentions parallelization patterns like parallel.for and distributing parallel work simi...
3,how can you benefit from accelerators without learning their internal architecture?,use patterns like parallel.for or cuda-like distribution for using accelerators without knowing ...
4,what is an example of using hybridizer?,using parallel.for with a lambda to utilize the computing power of accelerators is an example.


Files have been successfully saved.


In [11]:
import csv

# Export the frame with every field wrapped in quotes (csv.QUOTE_ALL),
# UTF-8 encoded and without the index column.
# NOTE(review): this writes every column currently in `data`, not only
# 'question' and 'answer'; confirm that is intended.
data.to_csv(
    'Nvidia_cleaned_big_quoted.csv',
    index=False,
    quoting=csv.QUOTE_ALL,
    encoding='UTF-8',
)
