# October 2023 Questions

In [1]:
import io
from collections import Counter

import numpy as np
import pandas as pd

## Objective:
### Background:
As a principal data scientist, you often work with semi-structured data like XML. This XML data represents reviews from customers about a product, where each review has an associated rating and text content. 

### Question:
Write a Python function that:
1. Parses the XML data to extract reviews.
2. Uses regex to remove any non-alphanumeric characters (excluding spaces) from the review text.
3. Uses NLP techniques to tokenize the cleaned review text into words.
4. Returns a dictionary where keys are unique words and values are the frequency of the word across all reviews.

### Inputs:
- XML string containing multiple 'review' tags. 
  - Each 'review' tag has a 'rating' attribute (a number between 1 and 5), and the actual review content is the text enclosed by the tag.

### Outputs:
- A dictionary with:
  - keys: unique words (strings)
  - values: frequency of the word across all reviews (int).

### Libraries Needed:
- xml.etree.ElementTree
- re
- nltk.tokenize


In [2]:
## Data:
# Sample input: a single <data> root wrapping seven <review> elements. Each
# review stores its score in the `rating` attribute and the review text as the
# element's content. Several reviews contain apostrophes and punctuation.
xml_data = """
<data>
    <review rating="5">I love this product! It's fantastic.</review>
    <review rating="3">It's okay, not the best. Could be better.</review>
    <review rating="1">Really bad product. Don't buy!</review>
    <review rating="4">Pretty good, but there's room for improvement.</review>
    <review rating="2">Meh, I've had better products before.</review>
    <review rating="3">It's decent. Worth the price.</review>
    <review rating="5">Amazing! Highly recommend to everyone.</review>
</data>
"""


In [3]:
# craft a function to parse the xml data to extract reviews and ratings
def parse_xml(xml_data):
    """
    Parse XML data to extract reviews and ratings.

    Parameters
    ----------
    xml_data : str, bytes, path or file-like
        Literal XML markup (text or bytes), or any other input that
        ``pd.read_xml`` accepts, which is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per ``<review>`` element matched by the ``.//review`` XPath.
        For the notebook's sample data this yields a ``rating`` column and a
        ``review`` column holding the element text.
    """
    # pandas 2.1 deprecated handing literal XML straight to read_xml and asks
    # for it to be wrapped in a StringIO/BytesIO buffer. Only wrap input that
    # actually looks like markup so path-like strings keep working as before.
    if isinstance(xml_data, str) and xml_data.lstrip().startswith("<"):
        xml_data = io.StringIO(xml_data)
    elif isinstance(xml_data, bytes) and xml_data.lstrip().startswith(b"<"):
        xml_data = io.BytesIO(xml_data)
    return pd.read_xml(xml_data, xpath=".//review", parser="lxml")

In [4]:
# Preview the parsed sample: the bare expression renders the resulting DataFrame.
parse_xml(xml_data)

Unnamed: 0,rating,review
0,5,I love this product! It's fantastic.
1,3,"It's okay, not the best. Could be better."
2,1,Really bad product. Don't buy!
3,4,"Pretty good, but there's room for improvement."
4,2,"Meh, I've had better products before."
5,3,It's decent. Worth the price.
6,5,Amazing! Highly recommend to everyone.


In [5]:
#regex to remove non-alphanumeric characters
import re
def remove_non_alphanumeric(text):
    """
    Remove non-alphanumeric characters from text.

    Letters and digits of any script are kept, as is all whitespace;
    punctuation, symbols and underscores are deleted (not replaced by a
    space), so "It's" becomes "Its".

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        ``text`` with every character that is neither alphanumeric nor
        whitespace removed.
    """
    # For str patterns \w is Unicode-aware, so accented and non-Latin letters
    # survive (an ASCII-only class would turn "café" into "caf"). The
    # underscore is part of \w but is not alphanumeric, so drop it explicitly.
    return re.sub(r'[^\w\s]|_', '', text)

In [10]:
#apply remove_non_alphanumeric function to the reviews column
def clean_reviews(df):
    """
    Clean reviews column in df.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``review`` column of review text.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in which every ``review`` value has
        been passed through ``remove_non_alphanumeric``. The input frame is
        left unmodified, so re-running a cell on the same frame is safe.
    """
    # na_action='ignore' lets missing reviews pass through untouched instead of
    # handing a NaN to the regex helper, which would raise a TypeError.
    cleaned = df['review'].map(remove_non_alphanumeric, na_action='ignore')
    # assign() builds a new frame rather than overwriting the caller's column.
    return df.assign(review=cleaned)

In [11]:
# Parse the sample XML and strip punctuation from the review text, then display the result.
clean_reviews(parse_xml(xml_data))

Unnamed: 0,rating,review
0,5,I love this product Its fantastic
1,3,Its okay not the best Could be better
2,1,Really bad product Dont buy
3,4,Pretty good but theres room for improvement
4,2,Meh Ive had better products before
5,3,Its decent Worth the price
6,5,Amazing Highly recommend to everyone


In [12]:
# Build a dictionary from the review column: the keys are the words that appear in the reviews, and the values are the number of times each word appears across all reviews
def count_words(df):
    """
    Count the number of times each word is mentioned in reviews.

    Words are the whitespace-separated tokens of each ``review`` value;
    counting is case-sensitive ("Its" and "its" are different keys).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``review`` column of review text.

    Returns
    -------
    dict
        Maps each word to its total number of occurrences across all reviews,
        with keys in order of first appearance.
    """
    word_count = Counter()
    for review in df['review']:
        # Skip missing reviews (NaN/None), which have no .split().
        if isinstance(review, str):
            word_count.update(review.split())
    # Hand back a plain dict so callers and cell output see the same type as before.
    return dict(word_count)

In [13]:
# Run parse -> clean -> count on the sample data and display the word-frequency dictionary.
count_words(clean_reviews(parse_xml(xml_data)))

{'I': 1,
 'love': 1,
 'this': 1,
 'product': 2,
 'Its': 3,
 'fantastic': 1,
 'okay': 1,
 'not': 1,
 'the': 2,
 'best': 1,
 'Could': 1,
 'be': 1,
 'better': 2,
 'Really': 1,
 'bad': 1,
 'Dont': 1,
 'buy': 1,
 'Pretty': 1,
 'good': 1,
 'but': 1,
 'theres': 1,
 'room': 1,
 'for': 1,
 'improvement': 1,
 'Meh': 1,
 'Ive': 1,
 'had': 1,
 'products': 1,
 'before': 1,
 'decent': 1,
 'Worth': 1,
 'price': 1,
 'Amazing': 1,
 'Highly': 1,
 'recommend': 1,
 'to': 1,
 'everyone': 1}

In [14]:
def sort_dict(word_count_dict):
    """
    Order a dictionary's entries from the largest value to the smallest.

    Returns a list of ``(key, value)`` tuples. Entries with equal values keep
    the relative order they had in the dictionary.
    """
    def by_value(entry):
        # entry is a (key, value) pair; rank on the value only
        return entry[1]

    ranked = list(word_count_dict.items())
    ranked.sort(key=by_value, reverse=True)
    return ranked

In [16]:
def count_words_from_xml(xml_data):
    """
    Run the full pipeline on XML review data: parse, clean, count, rank.

    Returns the result of ``sort_dict``: a list of ``(word, count)`` tuples
    ordered from most to least frequent, not a dictionary.
    """
    reviews = parse_xml(xml_data)
    cleaned = clean_reviews(reviews)
    frequencies = count_words(cleaned)
    return sort_dict(frequencies)

In [17]:
# End-to-end run on the sample data; displays (word, count) pairs, most frequent first.
count_words_from_xml(xml_data)

[('Its', 3),
 ('product', 2),
 ('the', 2),
 ('better', 2),
 ('I', 1),
 ('love', 1),
 ('this', 1),
 ('fantastic', 1),
 ('okay', 1),
 ('not', 1),
 ('best', 1),
 ('Could', 1),
 ('be', 1),
 ('Really', 1),
 ('bad', 1),
 ('Dont', 1),
 ('buy', 1),
 ('Pretty', 1),
 ('good', 1),
 ('but', 1),
 ('theres', 1),
 ('room', 1),
 ('for', 1),
 ('improvement', 1),
 ('Meh', 1),
 ('Ive', 1),
 ('had', 1),
 ('products', 1),
 ('before', 1),
 ('decent', 1),
 ('Worth', 1),
 ('price', 1),
 ('Amazing', 1),
 ('Highly', 1),
 ('recommend', 1),
 ('to', 1),
 ('everyone', 1)]

---