In [1]:
# to interact with files we use os module
import os

In [2]:
# check the present working directory

os.getcwd()

'/home/ehp/Documents/Jupyter-Notebook/jovian_data_analysis'

In [3]:
# To get the list of files in a directory, use os.listdir. 
# You pass an absolute or relative path of a directory as the argument to the function.

os.listdir('.')

['data',
 '5-functions.ipynb',
 '.ipynb_checkpoints',
 'Processing data from files.ipynb']

In [4]:
os.listdir(os.getcwd())

['data',
 '5-functions.ipynb',
 '.ipynb_checkpoints',
 'Processing data from files.ipynb']

In [5]:
os.listdir('/usr/')

['bin',
 'lib64',
 'libx32',
 'games',
 'src',
 'share',
 'lib32',
 'lib',
 'include',
 'libexec',
 'sbin',
 'local']

In [6]:
# Create new directory
# You can create a new directory using os.makedirs. 
# Let's create a new directory called data, where we'll later download some files.

os.makedirs(name='./data', exist_ok=True)

In [7]:
# Let's verify that the directory was created and is currently empty.
if 'data' in os.listdir('.'):
    print('data already exists')
else:
    print('No, data doesn\'t exist. you should created before countiniued!')

data already exists


In [8]:
# Let us download some files into the data directory using the urllib module.
url1 = 'https://gist.githubusercontent.com/aakashns/257f6e6c8719c17d0e498ea287d1a386/raw/7def9ef4234ddf0bc82f855ad67dac8b971852ef/loans1.txt'
url2 = 'https://gist.githubusercontent.com/aakashns/257f6e6c8719c17d0e498ea287d1a386/raw/7def9ef4234ddf0bc82f855ad67dac8b971852ef/loans2.txt'
url3 = 'https://gist.githubusercontent.com/aakashns/257f6e6c8719c17d0e498ea287d1a386/raw/7def9ef4234ddf0bc82f855ad67dac8b971852ef/loans3.txt'

In [9]:
import urllib.request

In [10]:
urls = [url1, url2, url3]

for i, url in enumerate(urls, start=1):
    urllib.request.urlretrieve(url, './data/loans'+str(i)+'.txt')
    

In [11]:
os.listdir('./data/')

['loans3.txt', 'loans1.txt', '.ipynb_checkpoints', 'loans2.txt']

In [12]:
# Reading files

with open('./data/loans2.txt', 'r', encoding='utf-8') as file2:
    file2_contents = file2.read()
    print(file2_contents)

amount,duration,rate,down_payment
828400,120,0.11,100000
4633400,240,0.06,
42900,90,0.08,8900
983000,16,0.14,
15230,48,0.07,4300


In [13]:
# Reading a file line by line
# File objects provide a readlines method to read a file line-by-line.
with open('./data/loans1.txt', 'r', encoding='utf-8') as file1:
    file1_content = file1.readlines()
    print(file1_content)

['amount,duration,rate,down_payment\n', '100000,36,0.08,20000\n', '200000,12,0.1,\n', '628400,120,0.12,100000\n', '4637400,240,0.06,\n', '42900,90,0.07,8900\n', '916000,16,0.13,\n', '45230,48,0.08,4300\n', '991360,99,0.08,\n', '423000,27,0.09,47200']


In [14]:
file2_contents

'amount,duration,rate,down_payment\n828400,120,0.11,100000\n4633400,240,0.06,\n42900,90,0.08,8900\n983000,16,0.14,\n15230,48,0.07,4300'

### Processing data from files

Before performing any operations on the data stored in a file, we need to convert the file's contents from one large string into Python data types. For the file `loans1.txt` containing information about loans in a CSV format, we can do the following:

* Read the file line by line
* Parse the first line to get a list of the column names or headers
* Split each remaining line and convert each value into a float
* Create a dictionary for each loan using the headers as keys
* Create a list of dictionaries to keep track of all the loans

Since we will perform the same operations for multiple files, it would be useful to define a function `read_csv`. We'll also define some helper functions to build up the functionality step by step. 

Let's start by defining a function `parse_header` that takes a line as input and returns a list of column headers.

In [15]:
def parse_header(line):
    return line.strip().split(',')

In [16]:
parse_header(file1_content[0])

['amount', 'duration', 'rate', 'down_payment']

In [17]:
def strip_values(line):
    return line.strip()
strip_values(file1_content[4])

'4637400,240,0.06,'

In [18]:
def convert_to_float(line):
    
    #line = strip_values(line)
    #line = line.split(',')
    line = [float(l) if l != '' else 'None' for l in line.strip().split(',')]
    return line
    
for i in file1_content[1:]:
    print(convert_to_float(i))


[100000.0, 36.0, 0.08, 20000.0]
[200000.0, 12.0, 0.1, 'None']
[628400.0, 120.0, 0.12, 100000.0]
[4637400.0, 240.0, 0.06, 'None']
[42900.0, 90.0, 0.07, 8900.0]
[916000.0, 16.0, 0.13, 'None']
[45230.0, 48.0, 0.08, 4300.0]
[991360.0, 99.0, 0.08, 'None']
[423000.0, 27.0, 0.09, 47200.0]


In [19]:
file1_content[1:]

['100000,36,0.08,20000\n',
 '200000,12,0.1,\n',
 '628400,120,0.12,100000\n',
 '4637400,240,0.06,\n',
 '42900,90,0.07,8900\n',
 '916000,16,0.13,\n',
 '45230,48,0.08,4300\n',
 '991360,99,0.08,\n',
 '423000,27,0.09,47200']

In [20]:
loan1_header = parse_header(file1_content[0])
loan1_content = [convert_to_float(i) for i in file1_content[1:]]

print(loan1_header)
print(loan1_content)

['amount', 'duration', 'rate', 'down_payment']
[[100000.0, 36.0, 0.08, 20000.0], [200000.0, 12.0, 0.1, 'None'], [628400.0, 120.0, 0.12, 100000.0], [4637400.0, 240.0, 0.06, 'None'], [42900.0, 90.0, 0.07, 8900.0], [916000.0, 16.0, 0.13, 'None'], [45230.0, 48.0, 0.08, 4300.0], [991360.0, 99.0, 0.08, 'None'], [423000.0, 27.0, 0.09, 47200.0]]


In [21]:
# create a dict for each loan

def loan_dict(file):
    header = parse_header(file[0])
    lst = file[1:]
    # lst = [convert_to_float(i) for i in file[1:]]
    loan = {}
    
    for i in header:
        for j in lst:
            a = convert_to_float(j)
            for k in a:
                loan[i] = k
    print(loan)
    print('+' * 12)
    return loan

    
loan_dict(file1_content)

{'amount': 47200.0, 'duration': 47200.0, 'rate': 47200.0, 'down_payment': 47200.0}
++++++++++++


{'amount': 47200.0,
 'duration': 47200.0,
 'rate': 47200.0,
 'down_payment': 47200.0}

In [22]:
file = file1_content
header = parse_header(file[0])
#lst = [convert_to_float(i) for i in file[1:]]
lst = convert_to_float(file[4])
print(header)
print(lst)

['amount', 'duration', 'rate', 'down_payment']
[4637400.0, 240.0, 0.06, 'None']


In [23]:
lan_dict = {}

for k,v in zip(header, lst):
    lan_dict[k] = v
lan_dict

{'amount': 4637400.0, 'duration': 240.0, 'rate': 0.06, 'down_payment': 'None'}

In [24]:
def create_loan_dict(file):
    loan_dict = {}
    loans = []
    header = parse_header(file[0])
    lst = [convert_to_float(i) for i in file[1:]]
    for i in lst:
        for k,v in zip(header, i):
            loan_dict[k] = v
            loan_dict_copy = loan_dict.copy()
        loans.append(loan_dict_copy)
        # print(loan_dict)
        
    
    return loans

file = file1_content
create_loan_dict(file)
    

[{'amount': 100000.0, 'duration': 36.0, 'rate': 0.08, 'down_payment': 20000.0},
 {'amount': 200000.0, 'duration': 12.0, 'rate': 0.1, 'down_payment': 'None'},
 {'amount': 628400.0,
  'duration': 120.0,
  'rate': 0.12,
  'down_payment': 100000.0},
 {'amount': 4637400.0,
  'duration': 240.0,
  'rate': 0.06,
  'down_payment': 'None'},
 {'amount': 42900.0, 'duration': 90.0, 'rate': 0.07, 'down_payment': 8900.0},
 {'amount': 916000.0, 'duration': 16.0, 'rate': 0.13, 'down_payment': 'None'},
 {'amount': 45230.0, 'duration': 48.0, 'rate': 0.08, 'down_payment': 4300.0},
 {'amount': 991360.0, 'duration': 99.0, 'rate': 0.08, 'down_payment': 'None'},
 {'amount': 423000.0, 'duration': 27.0, 'rate': 0.09, 'down_payment': 47200.0}]

In [25]:
with open('./data/loans3.txt', 'r', encoding='utf-8') as file3:
    file3_content = file3.readlines()

print(file3_content)

['amount,duration,rate,down_payment\n', '45230,48,0.07,4300\n', '883000,16,0.14,\n', '100000,12,0.1,\n', '728400,120,0.12,100000\n', '3637400,240,0.06,\n', '82900,90,0.07,8900\n', '316000,16,0.13,\n', '15230,48,0.08,4300\n', '991360,99,0.08,\n', '323000,27,0.09,4720010000,36,0.08,20000\n', '528400,120,0.11,100000\n', '8633400,240,0.06,\n', '12900,90,0.08,8900']


In [26]:
file = file3_content
create_loan_dict(file)

[{'amount': 45230.0, 'duration': 48.0, 'rate': 0.07, 'down_payment': 4300.0},
 {'amount': 883000.0, 'duration': 16.0, 'rate': 0.14, 'down_payment': 'None'},
 {'amount': 100000.0, 'duration': 12.0, 'rate': 0.1, 'down_payment': 'None'},
 {'amount': 728400.0,
  'duration': 120.0,
  'rate': 0.12,
  'down_payment': 100000.0},
 {'amount': 3637400.0,
  'duration': 240.0,
  'rate': 0.06,
  'down_payment': 'None'},
 {'amount': 82900.0, 'duration': 90.0, 'rate': 0.07, 'down_payment': 8900.0},
 {'amount': 316000.0, 'duration': 16.0, 'rate': 0.13, 'down_payment': 'None'},
 {'amount': 15230.0, 'duration': 48.0, 'rate': 0.08, 'down_payment': 4300.0},
 {'amount': 991360.0, 'duration': 99.0, 'rate': 0.08, 'down_payment': 'None'},
 {'amount': 323000.0,
  'duration': 27.0,
  'rate': 0.09,
  'down_payment': 4720010000.0},
 {'amount': 528400.0,
  'duration': 120.0,
  'rate': 0.11,
  'down_payment': 100000.0},
 {'amount': 8633400.0,
  'duration': 240.0,
  'rate': 0.06,
  'down_payment': 'None'},
 {'amount'

In [27]:
with open('./data/loans2.txt', 'r', encoding='utf-8') as file2:
    file2_content = file2.readlines()

print(file2_content)

['amount,duration,rate,down_payment\n', '828400,120,0.11,100000\n', '4633400,240,0.06,\n', '42900,90,0.08,8900\n', '983000,16,0.14,\n', '15230,48,0.07,4300']


In [28]:
file = file2_content
create_loan_dict(file)

[{'amount': 828400.0,
  'duration': 120.0,
  'rate': 0.11,
  'down_payment': 100000.0},
 {'amount': 4633400.0,
  'duration': 240.0,
  'rate': 0.06,
  'down_payment': 'None'},
 {'amount': 42900.0, 'duration': 90.0, 'rate': 0.08, 'down_payment': 8900.0},
 {'amount': 983000.0, 'duration': 16.0, 'rate': 0.14, 'down_payment': 'None'},
 {'amount': 15230.0, 'duration': 48.0, 'rate': 0.07, 'down_payment': 4300.0}]