In [0]:
pip install nutter

In [0]:
%run /Users/vishnuas1987@gmail.com/PEI_Case_Study/Functions/PEI_transformation_functions

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType,BooleanType,LongType,DoubleType,FloatType
import runtime

In [0]:
from runtime.nutterfixture import NutterFixture,tag
class PEItestFixture(NutterFixture):
    """Nutter fixture that checks the PEI transformation functions.

    Every ``assertion_<name>`` method builds a small input DataFrame with the
    notebook's ``spark`` session, calls the function under test (the functions
    are expected to be in scope via the ``%run`` cell earlier in the notebook)
    and compares the collected rows against a hand-written expected DataFrame.

    Rows are compared by value only: column names are not part of the check,
    which is why ``assertion_clean_phone_number`` can compare the
    ``_cleaned_Name`` column against an expected column called ``Name``.
    """

    @staticmethod
    def _id_name_schema():
        """Return the (Id: int, Name: string) schema shared by most fixtures."""
        return StructType(
            [
                StructField("Id", IntegerType(), True),
                StructField("Name", StringType(), True),
            ]
        )

    @staticmethod
    def _assert_rows_match(df_actual, df_expected, ordered=True, rel_tol=1e-9):
        """Assert that two DataFrames hold the same rows.

        Args:
            df_actual: DataFrame produced by the function under test.
            df_expected: DataFrame holding the expected rows.
            ordered: When True, rows are compared in the order ``collect()``
                returns them. When False, both sides are sorted by value first
                so the check does not depend on Spark's row ordering.
            rel_tol: Relative tolerance used when both values are floats.

        Raises:
            AssertionError: If the row counts differ or any row differs. The
                message contains the offending rows to ease debugging.
        """
        # Local import: the notebook's import cell does not bring in ``math``.
        import math

        def sort_key(row):
            # Pair every value with an "is None" flag so rows containing nulls
            # can still be ordered without comparing None to a real value.
            return tuple((value is None, value) for value in row)

        def values_match(left, right):
            # Floats produced by aggregation may differ in the last bits from
            # a literal typed by hand, so compare them with a tolerance.
            if isinstance(left, float) and isinstance(right, float):
                return math.isclose(left, right, rel_tol=rel_tol)
            return left == right

        actual_rows = df_actual.collect()
        expected_rows = df_expected.collect()
        if not ordered:
            actual_rows = sorted(actual_rows, key=sort_key)
            expected_rows = sorted(expected_rows, key=sort_key)

        assert len(actual_rows) == len(expected_rows), (
            f"Row count differs: actual={len(actual_rows)}, "
            f"expected={len(expected_rows)}; actual rows={actual_rows!r}"
        )
        for position, (actual_row, expected_row) in enumerate(
            zip(actual_rows, expected_rows)
        ):
            same = len(actual_row) == len(expected_row) and all(
                values_match(left, right)
                for left, right in zip(actual_row, expected_row)
            )
            assert same, (
                f"Row {position} differs: actual={actual_row!r}, "
                f"expected={expected_row!r}"
            )

    # Test the function remove_non_alphabet
    def assertion_remove_non_alphabet(self):
        """Check remove_non_alphabet on names containing punctuation/symbols."""
        schema = self._id_name_schema()
        input_rows = [
            (1, 'Ad.       ..am Hart'),
            (2, 'Tam&^*ara Willing___)ham'),
            (3, 'Pete@#$ Takahito'),
            (4, 'johnXahiur'),
        ]
        expected_rows = [
            (1, 'Adam Hart'),
            (2, 'Tamara Willingham'),
            (3, 'Pete Takahito'),
            (4, 'John Xahiur'),
        ]
        df = spark.createDataFrame(input_rows, schema)
        df_expected = spark.createDataFrame(expected_rows, schema)
        df_actual = remove_non_alphabet(df, 'Name')
        self._assert_rows_match(df_actual, df_expected)

    # Test the function clean_phone_number
    def assertion_clean_phone_number(self):
        """Check clean_phone_number on valid, extension and invalid numbers."""
        schema = self._id_name_schema()
        input_rows = [
            (1, '001-581-945-5641'),
            (2, '(128)935-6357x6738'),
            (3, '713.315.6216x96588'),
            (4, '202-494-3717'),
            (5, '+1-556-9469745x3698'),
            (6, '563444'),
            (7, 'x563444'),
            (8, '-5566'),
        ]
        expected_rows = [
            (1, '+1(581)945-5641'),
            (2, '+1(128)935-6357x6738'),
            (3, '+1(713)315-6216x96588'),
            (4, '+1(202)494-3717'),
            (5, '+1(556)946-9745x3698'),
            (6, 'Invalid phone number'),
            (7, 'Invalid phone number'),
            (8, 'Invalid phone number'),
        ]
        df = spark.createDataFrame(input_rows, schema)
        df_expected = spark.createDataFrame(expected_rows, schema)
        df_actual = clean_phone_number(df, 'Name')
        # Only the Id and the cleaned column are compared with the expectation.
        self._assert_rows_match(df_actual.select("Id", "_cleaned_Name"), df_expected)

    # Test the function to clean the product name
    def assertion_clean_product_name(self):
        """Check clean_product_name on names with and without stray characters."""
        schema = self._id_name_schema()
        input_rows = [
            (1, 'Tenex Chairmat w/ Average Lip, 45" x 53"'),
            (2, 'ImationÂ USB 2.0 SwivelÂ Flash DriveÂ USBÂ flash driveÂ - 4 GB - Pink'),
            (3, '"While you Were Out" Message Book, One Form per Page'),
            (4, 'ImationÂ 30456 USBÂ Flash DriveÂ 8GB'),
            (5, 'Redi-Strip #10 Envelopes, 4 1/8 x 9 1/2'),
        ]
        expected_rows = [
            (1, 'Tenex Chairmat w/ Average Lip, 45" x 53"'),
            (2, 'Imation USB 2.0 Swivel Flash Drive USB flash drive - 4 GB - Pink'),
            (3, '"While you Were Out" Message Book, One Form per Page'),
            (4, 'Imation 30456 USB Flash Drive 8GB'),
            (5, 'Redi-Strip #10 Envelopes, 4 1/8 x 9 1/2'),
        ]
        df = spark.createDataFrame(input_rows, schema)
        df_expected = spark.createDataFrame(expected_rows, schema)
        df_actual = clean_product_name(df, 'Name')
        self._assert_rows_match(df_actual, df_expected)

    # Test the function to enrich the name field
    def assertion_enrich_name(self):
        """Check enrich_name on rows with present, derivable and missing names."""
        input_schema = StructType(
            [
                StructField("Id", IntegerType(), True),
                StructField("Name", StringType(), True),
                StructField("Email", StringType(), True),
            ]
        )
        expected_schema = StructType(
            [
                StructField("Id", IntegerType(), True),
                StructField("Name", StringType(), True),
                StructField("Email", StringType(), True),
                StructField("name_filled", BooleanType(), True),
            ]
        )
        input_rows = [
            (1, 'Lisa Ryan', ''),
            (2, '', 'clarencehughes280@gmail.com'),
            (3, '', 'bradleywilliams694'),
            (4, '', ''),
            (5, '', '@gmail.com'),
        ]
        expected_rows = [
            (1, 'Lisa Ryan', '', False),
            (2, 'clarencehughes', 'clarencehughes280@gmail.com', True),
            (3, 'bradleywilliams', 'bradleywilliams694', True),
            (4, '', '', True),
            (5, '', '@gmail.com', True),
        ]
        df = spark.createDataFrame(input_rows, input_schema)
        df_expected = spark.createDataFrame(expected_rows, expected_schema)
        df_actual = enrich_name(df, 'Name', 'Email')
        self._assert_rows_match(df_actual, df_expected)

    # Test the function to aggregate the dataframe
    def assertion_aggregate_dataframe(self):
        """Check aggregate_dataframe sums Profit per year/customer/category/sub-category."""
        input_schema = StructType(
            [
                StructField("Id", IntegerType(), True),
                StructField("Order_year", StringType(), True),
                StructField("Customer_ID", StringType(), True),
                StructField("Category", StringType(), True),
                StructField("Sub_Category", StringType(), True),
                StructField("Profit", DoubleType(), True),
            ]
        )
        expected_schema = StructType(
            [
                StructField("Order_year", StringType(), True),
                StructField("Customer_ID", StringType(), True),
                StructField("Category", StringType(), True),
                StructField("Sub_Category", StringType(), True),
                StructField("Total_Profit", DoubleType(), True),
            ]
        )
        input_rows = [
            (1, '2014', 'AB-10105', 'Office Supplies', 'Binders', 103.992),
            (2, '2014', 'PF-19165', 'Furniture', 'Tables', 124.4615385),
            (3, '2015', 'KB-16240', 'Furniture', 'Tables', 124.4615385),
            (4, '2015', 'PF-19165', 'Furniture', 'Chairs', 284.6666667),
            (5, '2015', 'PF-19165', 'Office Supplies', 'Paper', 60.4),
            (6, '2015', 'PF-19165', 'Furniture', 'Chairs', 284.6666667),
        ]
        # Rows 4 and 6 share the same grouping keys, so their profits add up.
        expected_rows = [
            ('2014', 'AB-10105', 'Office Supplies', 'Binders', 103.992),
            ('2014', 'PF-19165', 'Furniture', 'Tables', 124.4615385),
            ('2015', 'KB-16240', 'Furniture', 'Tables', 124.4615385),
            ('2015', 'PF-19165', 'Furniture', 'Chairs', 569.3333334),
            ('2015', 'PF-19165', 'Office Supplies', 'Paper', 60.4),
        ]
        df = spark.createDataFrame(input_rows, input_schema)
        df_expected = spark.createDataFrame(expected_rows, expected_schema)
        df_actual = aggregate_dataframe(
            df, "Order_year", "Customer_ID", "Category", "Sub_Category", "Profit"
        )
        # Spark does not guarantee row order after an aggregation, so compare
        # as an unordered set of rows (floats are matched with a tolerance).
        # NOTE(review): if aggregate_dataframe is meant to return sorted output,
        # switch this back to ordered=True - confirm against the function.
        self._assert_rows_match(df_actual, df_expected, ordered=False)

# Execute every assertion_* test case of the fixture and show the Nutter report.
result = PEItestFixture().execute_tests()
test_report = result.to_string()
print(test_report)


Notebook: N/A - Lifecycle State: N/A, Result: N/A
Run Page URL: N/A
PASSING TESTS
------------------------------------------------------------
aggregate_dataframe (1.088064380994183 seconds)
clean_phone_number (0.8460635490046116 seconds)
clean_product_name (0.7304145429952769 seconds)
enrich_name (0.7773054360004608 seconds)
remove_non_alphabet (0.8455801029922441 seconds)



